From 84a5f0e2eb84bd02a09e00d30d888f162a49e84b Mon Sep 17 00:00:00 2001 From: Pablo Romero Date: Fri, 26 Aug 2022 11:44:11 +0200 Subject: [PATCH 001/154] Fixes #3743. --- common.h | 4 ++-- ctest/CMakeLists.txt | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common.h b/common.h index 00d1d0baf..e6002d322 100644 --- a/common.h +++ b/common.h @@ -90,7 +90,7 @@ extern "C" { #endif #include -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_QNX) #include #include #endif @@ -107,7 +107,7 @@ extern "C" { #endif #endif -#ifdef OS_HAIKU +#if defined(OS_HAIKU) || defined(OS_QNX) #define NO_SYSV_IPC #endif diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index e779fb168..91338b73b 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -40,7 +40,7 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() add_test(NAME "x${float_char}cblat1" @@ -65,7 +65,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() add_test(NAME "x${float_char}cblat2" @@ -90,7 +90,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() add_test(NAME "x${float_char}cblat3" From 1b1f781cf986376cb28020d6e5dab9c35b40919e Mon Sep 17 00:00:00 2001 From: Pablo Romero Date: Fri, 26 Aug 2022 11:45:23 +0200 Subject: [PATCH 002/154] Added name and details to contributors' list. --- CONTRIBUTORS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1714d90c8..f5e9dda91 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -211,4 +211,5 @@ In chronological order: * PLCT Lab, Institute of Software Chinese Academy of Sciences * [2022-03] Support RISC-V Vector Intrinisc 1.0 version. 
- \ No newline at end of file +* Pablo Romero + * [2022-08] Fix building from sources for QNX \ No newline at end of file From e15f810a023da3f93fbff9552182d07c94bb849f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Aug 2022 17:31:57 +0200 Subject: [PATCH 003/154] Avoid spurious version queries and associated expr errors in the NOFORTRAN case --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f14a8a8ff..d5e9cbfc7 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -143,6 +143,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -mavx2 endif endif +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) ifeq ($(F_COMPILER), GFORTRAN) # AVX2 support was added in 4.7.0 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) @@ -159,6 +160,7 @@ endif endif endif endif +endif endif From 68277282df4adaafaf9b4a01c2eeb629eed99528 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 30 Aug 2022 22:26:16 +0200 Subject: [PATCH 004/154] Work around XCode assembler SVE bug --- Makefile.arm64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index 4efa55286..2ef0caa8b 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -124,7 +124,11 @@ ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 +else +CCOMMON_OPT += -march=armv8.5-a+sve -mtune=neoverse-n2 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif From ae3bcc8949cfaa8f37cfba864971227dc972fd96 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Aug 2022 10:41:01 +0200 Subject: [PATCH 005/154] Drop NeoverseN2 to armv8.2-a on OSX to make it build with gcc11 too --- Makefile.arm64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 2ef0caa8b..480684422 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -127,7 +127,7 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else -CCOMMON_OPT += -march=armv8.5-a+sve -mtune=neoverse-n2 +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 From 594ceeceda042e18265289d78cb31cbf82e41fa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Thu, 1 Sep 2022 14:35:12 +0200 Subject: [PATCH 006/154] CI (MSYS2): Configure with `-DCMAKE_BUILD_TYPE=Release`. 
--- .github/workflows/dynamic_arch.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index c34b0c462..418250675 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -235,7 +235,8 @@ jobs: - name: Configure OpenBLAS run: | mkdir build && cd build - cmake -DBUILD_SHARED_LIBS=ON \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=ON \ -DBUILD_STATIC_LIBS=ON \ -DDYNAMIC_ARCH=ON \ -DUSE_THREAD=ON \ @@ -258,6 +259,7 @@ jobs: timeout-minutes: 60 run: cd build && ctest + cross_build: runs-on: ubuntu-22.04 From c4d7ce338412bcfc4c6143d307d5b9547cb26db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Thu, 1 Sep 2022 18:08:39 +0200 Subject: [PATCH 007/154] CI (MSYS2): Add one runner with `-DCMAKE_BUILD_TYPE=None`. --- .github/workflows/dynamic_arch.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 418250675..669e41059 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -150,6 +150,7 @@ jobs: matrix: msystem: [MINGW64, MINGW32, CLANG64] idx: [int32, int64] + build-type: [Release] include: - msystem: MINGW64 idx: int32 @@ -173,6 +174,11 @@ jobs: idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 c-lapack-flags: -DC_LAPACK=ON + - msystem: MINGW64 + idx: int32 + target-prefix: mingw-w64-x86_64 + fc-pkg: mingw-w64-x86_64-gcc-fortran + build-type: None exclude: - msystem: MINGW32 idx: int64 @@ -215,11 +221,11 @@ jobs: path: C:/msys64/home/runneradmin/.ccache # We include the commit sha in the cache key, as new cache entries are # only created if there is no existing entry for the key yet. - key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }} + key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} # Restore a matching ccache cache entry. Prefer same branch. restore-keys: | - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }} - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }} - name: Configure ccache # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. 
@@ -235,7 +241,7 @@ jobs: - name: Configure OpenBLAS run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ -DBUILD_SHARED_LIBS=ON \ -DBUILD_STATIC_LIBS=ON \ -DDYNAMIC_ARCH=ON \ From 41e51dbc1b8a334394e425cd9e70757aac9c0cb6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 2 Sep 2022 13:07:51 +0200 Subject: [PATCH 008/154] add target for mips xbuild --- .github/workflows/dynamic_arch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index c34b0c462..1814e9e56 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -267,7 +267,7 @@ jobs: include: - target: mips64el triple: mips64el-linux-gnuabi64 - opts: DYNAMIC_ARCH=1 + opts: DYNAMIC_ARCH=1 TARGET=GENERIC - target: riscv64 triple: riscv64-linux-gnu opts: TARGET=RISCV64_GENERIC From 992a9222ffe7dd78fdad5ff1e0e32d11e9469d5a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Sep 2022 09:56:25 +0200 Subject: [PATCH 009/154] Move all Apple jobs on Azure to macos-11 following deprecation --- azure-pipelines.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1102bf0f5..67a343d8a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -141,7 +141,7 @@ jobs: - job: OSX_OpenMP pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -151,7 +151,7 @@ jobs: - job: OSX_GCC_Nothreads pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -159,7 +159,7 @@ jobs: - job: OSX_OpenMP_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -172,7 +172,7 @@ jobs: - job: OSX_OpenMP_Clang_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -188,7 +188,7 @@ jobs: - job: OSX_dynarch_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -202,7 +202,7 @@ jobs: - job: OSX_Ifort_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg @@ -235,7 +235,7 @@ jobs: - job: OSX_NDK_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -255,7 +255,7 @@ jobs: - job: OSX_IOS_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 From 739c3c44a77d87d1b08de59bf868250683c0f755 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Sep 2022 15:01:22 +0200 Subject: [PATCH 010/154] Work around windows/osx gcc12 x86_64 tree-optimizer problem and add an osx/gcc12 build to Azure CI (#3745) Add pragma to disable the gcc tree-optimizer for some x86_64 S and Z kernels with gcc12 on OSX or Windows --- azure-pipelines.yml | 8 ++++++++ kernel/x86_64/sgemv_n_4.c | 5 ++++- kernel/x86_64/sgemv_t_4.c | 5 ++++- 
kernel/x86_64/ssymv_L.c | 5 ++++- kernel/x86_64/ssymv_U.c | 5 ++++- kernel/x86_64/zdot.c | 4 +++- kernel/x86_64/zgemv_n_4.c | 9 +++------ kernel/x86_64/zgemv_t_4.c | 4 +++- 8 files changed, 33 insertions(+), 12 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1102bf0f5..8236c6cc3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,6 +157,14 @@ jobs: brew update make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 +- job: OSX_GCC12 + pool: + vmImage: 'macOS-latest' + steps: + - script: | + brew update + make CC=gcc-12 FC=gfortran-12 + - job: OSX_OpenMP_Clang pool: vmImage: 'macOS-10.15' diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 621ddc622..c9681fa8b 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "sgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 0be2c7e97..07aa51503 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 29d6a9958..45914daf5 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 02bbc1c64..26e5ca7e9 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_U_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index c52575d07..27397ccfa 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 2d6866a78..8fc960610 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -25,10 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" @@ -231,10 +232,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT xbuffer[8],*ybuffer; -#if 0 -printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y); -#endif - if ( m < 1 ) return(0); if ( n < 1 ) return(0); diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index c2791e0f3..63c8b11a4 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" From 389e378063720639447757a20cd61274521e0573 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 7 Sep 2022 09:01:03 +0200 Subject: [PATCH 011/154] Remove excessive quoting of arguments from PR3722 --- Makefile.prebuild | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index 5dd7dfa4e..0be4f1274 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -60,9 +60,9 @@ all: getarch_2nd ./getarch_2nd 1 >> $(TARGET_CONF) $(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch - ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" + ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" $(TARGET_FLAGS) $(CFLAGS) ifneq ($(ONLY_CBLAS), 1) - ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" "$(TARGET_FLAGS)" + ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" $(TARGET_FLAGS) else #When we only build CBLAS, we set NOFORTRAN=2 echo "NOFORTRAN=2" >> $(TARGET_MAKE) @@ -77,8 +77,8 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_AVX512); \ - rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_RV64GV); \ + avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ + rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \ $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy From 365936ae1b1dfa2f50b3e65c68ae95babc6f2af2 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 13 Sep 2022 16:38:01 +0800 Subject: [PATCH 012/154] MIPS64: Using the macro MTC rather than MTC1 --- kernel/mips64/dnrm2.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index 0ccc781e1..cd40414a2 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -90,7 +90,7 @@ //Init INF lui TEMP, 0x7FF0 dsll TEMP, TEMP, 32 - MTC1 TEMP, INF + MTC TEMP, INF LD a1, 0 * SIZE(X) daddiu N, N, -1 From 23d59baaf19094a9e70721f4549c78f0a1c2f9a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Sep 2022 22:39:27 +0200 Subject: [PATCH 013/154] Add -mfma to -mavx2 for Apple clang, and set AVX2 options for Zen as well --- kernel/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index cbe4cde6e..977886044 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) - AVX2OPT = -mavx2 + AVX2OPT = -mavx2 -mfma endif endif ifdef NO_AVX2 @@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), ZEN) + 
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else From a0a4f7c44759e9e4705f0fb1e42d8c8c7c0c68b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Sep 2022 22:47:00 +0200 Subject: [PATCH 014/154] Add -mfma to -mavx2 for clang, and add AVX2 declaration for Zen in DYNAMIC_ARCH builds --- cmake/system.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index a9fc0f4b7..fd68f79d6 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -197,14 +197,14 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() - if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) + if ((${TARGET} STREQUAL HASWELL OR ${TARGET} STREQUAL ZEN) AND NOT NO_AVX2) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2 -mfma") endif() endif() if (DEFINED HAVE_AVX) From 515cf269291bec0d43651fe7bf99a71fb074a0ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 11:48:36 +0200 Subject: [PATCH 015/154] Fix pointer/integer argument mismatch in calls to pow() --- lapack-netlib/SRC/claed0.c | 4 ++-- lapack-netlib/SRC/claed7.c | 4 ++-- lapack-netlib/SRC/clalsa.c | 6 +++--- lapack-netlib/SRC/cstedc.c | 4 ++-- lapack-netlib/SRC/dlaed0.c | 4 ++-- lapack-netlib/SRC/dlaed7.c | 4 ++-- lapack-netlib/SRC/dlaeda.c | 8 ++++---- lapack-netlib/SRC/dlalsa.c | 6 +++--- lapack-netlib/SRC/dlasd0.c | 2 +- lapack-netlib/SRC/dlasda.c | 4 ++-- lapack-netlib/SRC/dstedc.c | 4 ++-- lapack-netlib/SRC/slaed0.c | 4 ++-- lapack-netlib/SRC/slaed7.c | 4 ++-- lapack-netlib/SRC/slaeda.c | 8 ++++---- lapack-netlib/SRC/slalsa.c | 6 +++--- lapack-netlib/SRC/slasd0.c | 2 +- lapack-netlib/SRC/slasda.c | 4 ++-- lapack-netlib/SRC/sstedc.c | 4 ++-- lapack-netlib/SRC/zlaed0.c | 4 ++-- lapack-netlib/SRC/zlaed7.c | 4 ++-- lapack-netlib/SRC/zlalsa.c | 6 +++--- lapack-netlib/SRC/zstedc.c | 4 ++-- 22 files changed, 50 insertions(+), 50 deletions(-) diff --git a/lapack-netlib/SRC/claed0.c b/lapack-netlib/SRC/claed0.c index 21e408397..2b696508e 100644 --- a/lapack-netlib/SRC/claed0.c +++ b/lapack-netlib/SRC/claed0.c @@ -796,10 +796,10 @@ L10: temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/claed7.c b/lapack-netlib/SRC/claed7.c index 49fc9ed4b..1eaa7e9c2 100644 --- a/lapack-netlib/SRC/claed7.c +++ b/lapack-netlib/SRC/claed7.c @@ -864,11 +864,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/clalsa.c b/lapack-netlib/SRC/clalsa.c index 4bc3830a9..2ef3e1231 100644 --- a/lapack-netlib/SRC/clalsa.c +++ b/lapack-netlib/SRC/clalsa.c @@ -1051,7 +1051,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1065,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1110,7 @@ L170: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/cstedc.c b/lapack-netlib/SRC/cstedc.c index 437c39e96..8f047d1ce 100644 --- a/lapack-netlib/SRC/cstedc.c +++ b/lapack-netlib/SRC/cstedc.c @@ -836,10 +836,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if (icompz == 1) { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; diff --git a/lapack-netlib/SRC/dlaed0.c b/lapack-netlib/SRC/dlaed0.c index 95e39b0df..74e58dd2d 100644 --- a/lapack-netlib/SRC/dlaed0.c +++ b/lapack-netlib/SRC/dlaed0.c @@ -827,10 +827,10 @@ L10: temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/dlaed7.c b/lapack-netlib/SRC/dlaed7.c index fd8515261..d23a72be0 100644 --- a/lapack-netlib/SRC/dlaed7.c +++ b/lapack-netlib/SRC/dlaed7.c @@ -885,11 +885,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/dlaeda.c b/lapack-netlib/SRC/dlaeda.c index f4bb214d3..202e1b636 100644 --- a/lapack-netlib/SRC/dlaeda.c +++ b/lapack-netlib/SRC/dlaeda.c @@ -754,7 +754,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -781,12 +781,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -847,7 +847,7 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } diff --git a/lapack-netlib/SRC/dlalsa.c b/lapack-netlib/SRC/dlalsa.c index 891ed66a8..4d5c347c3 100644 --- a/lapack-netlib/SRC/dlalsa.c +++ b/lapack-netlib/SRC/dlalsa.c @@ -951,7 +951,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -965,7 +965,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1010,7 +1010,7 @@ L50: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/dlasd0.c b/lapack-netlib/SRC/dlasd0.c index c702665b0..0f88527ef 100644 --- a/lapack-netlib/SRC/dlasd0.c +++ b/lapack-netlib/SRC/dlasd0.c @@ -824,7 +824,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/dlasda.c b/lapack-netlib/SRC/dlasda.c index 72f9d55f3..a9190f805 100644 --- a/lapack-netlib/SRC/dlasda.c +++ b/lapack-netlib/SRC/dlasda.c @@ -1027,7 +1027,7 @@ f"> */ /* Now conquer each subproblem bottom-up. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1039,7 +1039,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/dstedc.c b/lapack-netlib/SRC/dstedc.c index ef2eeabe8..56511d6cf 100644 --- a/lapack-netlib/SRC/dstedc.c +++ b/lapack-netlib/SRC/dstedc.c @@ -806,10 +806,10 @@ f"> */ lwmin = *n - 1 << 1; } else { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { diff --git a/lapack-netlib/SRC/slaed0.c b/lapack-netlib/SRC/slaed0.c index 33f7134c1..4c5230907 100644 --- a/lapack-netlib/SRC/slaed0.c +++ b/lapack-netlib/SRC/slaed0.c @@ -823,10 +823,10 @@ L10: temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/slaed7.c b/lapack-netlib/SRC/slaed7.c index 210d796d1..22fcaf76d 100644 --- a/lapack-netlib/SRC/slaed7.c +++ b/lapack-netlib/SRC/slaed7.c @@ -883,11 +883,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/slaeda.c b/lapack-netlib/SRC/slaeda.c index 7edaf8a76..3806427c2 100644 --- a/lapack-netlib/SRC/slaeda.c +++ b/lapack-netlib/SRC/slaeda.c @@ -753,7 +753,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -779,12 +779,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -844,7 +844,7 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } diff --git a/lapack-netlib/SRC/slalsa.c b/lapack-netlib/SRC/slalsa.c index 53da2c7bf..77a79b80c 100644 --- a/lapack-netlib/SRC/slalsa.c +++ b/lapack-netlib/SRC/slalsa.c @@ -946,7 +946,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -960,7 +960,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1005,7 +1005,7 @@ L50: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/slasd0.c b/lapack-netlib/SRC/slasd0.c index aa553579e..be1a74191 100644 --- a/lapack-netlib/SRC/slasd0.c +++ b/lapack-netlib/SRC/slasd0.c @@ -821,7 +821,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/slasda.c b/lapack-netlib/SRC/slasda.c index 71424c3f1..1d336d1ce 100644 --- a/lapack-netlib/SRC/slasda.c +++ b/lapack-netlib/SRC/slasda.c @@ -1023,7 +1023,7 @@ f"> */ /* Now conquer each subproblem bottom-up. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1035,7 +1035,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/sstedc.c b/lapack-netlib/SRC/sstedc.c index 46ed15a1a..61ad3dd37 100644 --- a/lapack-netlib/SRC/sstedc.c +++ b/lapack-netlib/SRC/sstedc.c @@ -804,10 +804,10 @@ f"> */ lwmin = *n - 1 << 1; } else { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { diff --git a/lapack-netlib/SRC/zlaed0.c b/lapack-netlib/SRC/zlaed0.c index 37bd12b01..2b25f6e4e 100644 --- a/lapack-netlib/SRC/zlaed0.c +++ b/lapack-netlib/SRC/zlaed0.c @@ -793,10 +793,10 @@ L10: temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/zlaed7.c b/lapack-netlib/SRC/zlaed7.c index 093051917..8665ee12c 100644 --- a/lapack-netlib/SRC/zlaed7.c +++ b/lapack-netlib/SRC/zlaed7.c @@ -864,11 +864,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/zlalsa.c b/lapack-netlib/SRC/zlalsa.c index d17016e7d..cd0819c3d 100644 --- a/lapack-netlib/SRC/zlalsa.c +++ b/lapack-netlib/SRC/zlalsa.c @@ -1051,7 +1051,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1065,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1110,7 @@ L170: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/zstedc.c b/lapack-netlib/SRC/zstedc.c index 4cfc41840..55baba2d7 100644 --- a/lapack-netlib/SRC/zstedc.c +++ b/lapack-netlib/SRC/zstedc.c @@ -836,10 +836,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if (icompz == 1) { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; From 91110f92d218492d0efbdc1fdf34277ca45f4b36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 14:03:31 +0200 Subject: [PATCH 016/154] fix missing return type in function declaration --- ctest/c_sblat1c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 4993d31bb..57e4707a9 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -969,7 +969,7 @@ real *sfac; 1.17 }; /* Local variables */ - extern /* Subroutine */ srottest_(); + extern /* Subroutine */ void srottest_(); static integer i__, k, ksize; extern /* Subroutine */ int stest_(), srotmtest_(); static integer ki, kn; From 9773a9d6b3da46a8c499d0dcc22030641006fa7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 17:04:11 +0200 Subject: [PATCH 017/154] undefine YIELDING for the Emscripten js converter --- common.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common.h b/common.h index e6002d322..4eeeb8d55 100644 --- a/common.h +++ b/common.h @@ -387,6 +387,10 @@ typedef int blasint; #endif */ +#ifdef __EMSCRIPTEN__ +#define YIELDING +#endif + #ifndef YIELDING #define YIELDING sched_yield() #endif From b285307e184f8ff2a3e430442756c735a0243671 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 17:05:24 +0200 Subject: [PATCH 018/154] Add a kludge for the Emscripten js converter --- ctest.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctest.c b/ctest.c index df628b1d4..2ccae8dcc 100644 --- a/ctest.c +++ b/ctest.c @@ -173,3 +173,8 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__EMSCRIPTEN__) +ARCH_RISCV64 +OS_WINDOWS +#endif + From 9402df5604e69f86f58953e3883f33f98c930baf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 21:44:34 +0200 Subject: [PATCH 019/154] Fix missing external declaration --- driver/others/blas_server_omp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 1a5fd06a3..c158f92ee 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -69,6 +69,8 @@ int blas_server_avail = 0; +extern int openblas_omp_adaptive_env(); + static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; #ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; From 101a2c77c3f3610933f450cefca3e312edab2186 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Sep 2022 09:19:19 +0200 Subject: [PATCH 020/154] Fix warnings --- kernel/x86_64/dgemm_ncopy_8_skylakex.c | 24 ++++++++++++------------ kernel/x86_64/omatcopy_rt.c | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c index 74b336f3d..874ef68d6 100644 --- a/kernel/x86_64/dgemm_ncopy_8_skylakex.c +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -52,18 +52,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + FLOAT ctemp17 /*, ctemp18, ctemp19, ctemp20*/ ; + FLOAT /*ctemp21, ctemp22,*/ ctemp23, ctemp24; + FLOAT ctemp25 /*, ctemp26, ctemp27, ctemp28*/ ; + FLOAT /*ctemp29, ctemp30,*/ ctemp31, ctemp32; + FLOAT ctemp33 /*, ctemp34, ctemp35, ctemp36*/ ; + FLOAT /*ctemp37, ctemp38,*/ ctemp39, ctemp40; + FLOAT ctemp41 /*, ctemp42, ctemp43, ctemp44*/ ; + FLOAT /*ctemp45, ctemp46,*/ ctemp47, ctemp48; + FLOAT ctemp49 /*, ctemp50, ctemp51, ctemp52*/ ; + FLOAT /*ctemp53, ctemp54,*/ ctemp55, ctemp56; + FLOAT ctemp57 /*, ctemp58, ctemp59, ctemp60*/ ; + FLOAT /*ctemp61, ctemp62,*/ ctemp63, ctemp64; aoffset = a; diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c index e695f00c5..b11893f5d 100644 --- a/kernel/x86_64/omatcopy_rt.c +++ b/kernel/x86_64/omatcopy_rt.c @@ -142,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ } int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ - float *src, *dst, *dst_tmp, *src_base, *dst_base; + float *src, *dst, *dst_tmp=0, *src_base, *dst_base; uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; BLASLONG cols_left, rows_done; float ALPHA = alpha; if(ALPHA==0.0){ From 548a11b9d9aa7e5298f6a9092d917255d6f21644 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 16 Sep 2022 09:19:54 +0800 Subject: [PATCH 021/154] [WIP,Testing]: Add test for mips64 --- .github/workflows/mips64.yml | 114 +++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 .github/workflows/mips64.yml diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml new file mode 100644 index 000000000..a5bd7b84b --- /dev/null +++ b/.github/workflows/mips64.yml @@ -0,0 +1,114 @@ +name: mips64 qemu test + +on: [push, pull_request] + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: MIPS64_GENERIC + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=MIPS64_GENERIC + - target: SICORTEX + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=SICORTEX + - target: I6400 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6400 + - target: P6600 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=P6600 + - target: I6500 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6500 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross + + - name: checkout qemu + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 + + - name: build qemu + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system + make -j$(nproc) + make install + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-mips64el ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el 
./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat From edea1bcfafef8aab3528fd1859df229fde47a913 Mon Sep 17 00:00:00 2001 From: gxw Date: Sat, 17 Sep 2022 16:39:30 +0800 Subject: [PATCH 022/154] MIPS64: Fixed failed utest dsdot:dsdot_n_1 when TARGET=I6500 --- kernel/mips/sdot_msa.c | 151 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c index e02e10c61..8c250d401 100644 --- a/kernel/mips/sdot_msa.c +++ b/kernel/mips/sdot_msa.c @@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; +#if defined(DSDOT) + v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7; + v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7; + v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 dot2 = {0, 0}; + v2f64 dot3 = {0, 0}; +#else v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; v4f32 dot2 = {0, 0, 0, 0}; v4f32 dot3 = {0, 0, 0, 0}; +#endif if (n < 1) return (dot); @@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x_pref += 32; y_pref += 32; +#if defined(DSDOT) + /* Extend single precision to double precision */ + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + 
dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + dvy4 = __msa_fexupr_d(vy4); + dvy5 = __msa_fexupr_d(vy5); + dvy6 = __msa_fexupr_d(vy6); + dvy7 = __msa_fexupr_d(vy7); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + vy4 = (v4f32)__msa_fexupl_d(vy4); + vy5 = (v4f32)__msa_fexupl_d(vy5); + vy6 = (v4f32)__msa_fexupl_d(vy6); + vy7 = (v4f32)__msa_fexupl_d(vy7); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + dvx4 = __msa_fexupr_d(vx4); + dvx5 = __msa_fexupr_d(vx5); + dvx6 = __msa_fexupr_d(vx6); + dvx7 = __msa_fexupr_d(vx7); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + vx4 = (v4f32)__msa_fexupl_d(vx4); + vx5 = (v4f32)__msa_fexupl_d(vx5); + vx6 = (v4f32)__msa_fexupl_d(vx6); + vx7 = (v4f32)__msa_fexupl_d(vx7); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += (dvy4 * dvx4); + dot1 += (dvy5 * dvx5); + dot2 += (dvy6 * dvx6); + dot3 += (dvy7 * dvx7); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); + dot0 += ((v2f64)vy4 * (v2f64)vx4); + dot1 += ((v2f64)vy5 * (v2f64)vx5); + dot2 += ((v2f64)vy6 * (v2f64)vx6); + dot3 += ((v2f64)vy7 * (v2f64)vx7); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); @@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot1 += (vy5 * vx5); dot2 += (vy6 * vx6); dot3 += (vy7 * vx7); +#endif } if (n & 31) @@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); +#endif } if (n & 8) @@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP2_INC(x, 4, vx0, vx1); LD_SP2_INC(y, 4, vy0, vy1); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); +#endif } if (n & 4) @@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT 
*y, BLASLONG inc_y) vx0 = LD_SP(x); x += 4; vy0 = LD_SP(y); y += 4; +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + vy0 = (v4f32)__msa_fexupl_d(vy0); + dvx0 = __msa_fexupr_d(vx0); + vx0 = (v4f32)__msa_fexupl_d(vx0); + dot0 += (dvy0 * dvx0); + dot0 += ((v2f64)vy0 * (v2f64)vx0); +#else dot0 += (vy0 * vx0); +#endif } if (n & 2) @@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } @@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += dot0[0]; dot += dot0[1]; +#if !defined(DSDOT) dot += dot0[2]; dot += dot0[3]; +#endif } else { @@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(y, inc_y, y0, y1, y2, y3); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); + dot += ((double)y2 * (double)x2); + dot += ((double)y3 * (double)x3); +#else dot += (y0 * x0); dot += (y1 * x1); dot += (y2 * x2); dot += (y3 * x3); +#endif } if (n & 2) @@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } From b1d69fb3ac429f64fcb18b2ef4283f1701d67aa2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 Sep 2022 23:52:32 +0200 Subject: [PATCH 023/154] Add MIPS64_GENERIC as a copy of GENERIC --- kernel/mips64/KERNEL.MIPS64_GENERIC | 160 ++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 kernel/mips64/KERNEL.MIPS64_GENERIC diff --git a/kernel/mips64/KERNEL.MIPS64_GENERIC b/kernel/mips64/KERNEL.MIPS64_GENERIC new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips64/KERNEL.MIPS64_GENERIC @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = 
../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = 
../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c From 84453b924fe7695029cad974dfe0cf7bf6ffe0f6 Mon Sep 17 00:00:00 2001 From: "Kai T. Ohlhus" Date: Thu, 22 Sep 2022 00:20:40 +0900 Subject: [PATCH 024/154] Support CONSISTENT_FPCSR on AARCH64 --- driver/others/blas_server.c | 8 ++++++++ driver/others/blas_server_omp.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 9cfd825ec..051513f27 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -470,9 +470,13 @@ blas_queue_t *tscq; #endif #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif +#endif #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; @@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ queue -> position = pos; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); #endif +#endif #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index c158f92ee..e06ab8404 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -284,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ sb = queue -> sb; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif #endif if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { @@ -383,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ #ifdef CONSISTENT_FPCSR for (i = 0; i < num; i ++) { +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); +#endif } #endif From c2892f0e31d41f5e8d6c1324c6592459c19b4c59 Mon Sep 17 00:00:00 2001 From: "Kai T. Ohlhus" Date: Thu, 22 Sep 2022 00:25:13 +0900 Subject: [PATCH 025/154] Makefile.rule: update CONSISTENT_FPCSR documentation --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 359672359..a0ad90a68 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -207,7 +207,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). 
# CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute From 11cd1080958dc17c36857f1a6d5d9e705f144440 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Sep 2022 13:46:34 +0200 Subject: [PATCH 026/154] build: harden nightly-Homebrew-build.yml permissions Signed-off-by: Alex --- .github/workflows/nightly-Homebrew-build.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 29ec96f73..37ffe9e83 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -17,6 +17,10 @@ on: # it only makes sense to test if this file has been changed name: Nightly-Homebrew-Build + +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest From 4de8e1b8f922e531e9c49d8deb35fef993d17ee4 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Sep 2022 13:47:15 +0200 Subject: [PATCH 027/154] build: harden mips64.yml permissions Signed-off-by: Alex --- .github/workflows/mips64.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index a5bd7b84b..de7c0c0f3 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -2,6 +2,9 @@ name: mips64 qemu test on: [push, pull_request] +permissions: + contents: read # to fetch code (actions/checkout) + jobs: TEST: runs-on: ubuntu-latest From c726604319a038a7558d638985bbb60ac4983285 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Sep 2022 13:48:11 +0200 Subject: [PATCH 028/154] build: harden dynamic_arch.yml permissions Signed-off-by: Alex --- .github/workflows/dynamic_arch.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 138a853dd..49139317c 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -2,6 +2,9 @@ name: continuous build on: [push, pull_request] +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build: runs-on: ${{ matrix.os }} From f6f35a4288947091e51dda427537ecfb202ec904 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Sep 2022 08:47:14 +0200 Subject: [PATCH 029/154] fix copyobj declarations to work with DYNAMIC_ARCH --- kernel/mips64/KERNEL.MIPS64_GENERIC | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/mips64/KERNEL.MIPS64_GENERIC b/kernel/mips64/KERNEL.MIPS64_GENERIC index 17f2ef976..33bcbeedd 100644 --- a/kernel/mips64/KERNEL.MIPS64_GENERIC +++ b/kernel/mips64/KERNEL.MIPS64_GENERIC @@ -11,26 +11,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = 
cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c From d2ce93179f6747380488db2a56102dab6fde18ca Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Thu, 22 Sep 2022 10:38:36 -0700 Subject: [PATCH 030/154] Add `OPENBLAS_DEFAULT_NUM_THREADS` This allows Julia to set a default number of threads (usually `1`) to be used when no other thread counts are specified [0], to short-circuit the default OpenBLAS thread initialization routine that spins up a different number of threads than Julia would otherwise choose. The reason to add a new environment variable is that we want to be able to configure OpenBLAS to avoid performing its initial memory allocation/thread startup, as that can consume significant amounts of memory, but we still want to be sensitive to legacy codebases that set things like `OMP_NUM_THREADS` or `GOTOBLAS_NUM_THREADS`. Creating a new environment variable that is openblas-specific and is not already publicly used to control the overall number of threads of programs like Julia seems to be the best way forward. [0] https://github.com/JuliaLang/julia/pull/46844 --- driver/others/init.c | 2 ++ driver/others/openblas_env.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/driver/others/init.c b/driver/others/init.c index cc3145a62..cd10e8d36 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -823,6 +823,8 @@ void gotoblas_affinity_init(void) { if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS"); + if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS"); + numnodes = 1; if (numprocs == 1) { diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c index ef91a08e6..35b2270d4 100644 --- a/driver/others/openblas_env.c +++ b/driver/others/openblas_env.c @@ -67,10 +67,16 @@ void openblas_read_env() { openblas_env_thread_timeout=(unsigned int)ret; ret=0; - if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if (readenv(p,"OPENBLAS_DEFAULT_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; openblas_env_openblas_num_threads=ret; + ret=0; + if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + if(ret != 0 || openblas_env_openblas_num_threads == 0) + openblas_env_openblas_num_threads=ret; + ret=0; if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; From 5e78493d956093413142064184c273615e44da0b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Sep 2022 11:55:56 +0200 Subject: [PATCH 031/154] Move Cray case after GNU as Cray builds of gfortran have both names in the version string --- f_check | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/f_check b/f_check index bb13e1640..02e078b8a 100755 --- a/f_check +++ b/f_check @@ -82,10 +82,6 @@ else vendor=FUJITSU openmp='-Kopenmp' ;; - *Cray*) - vendor=CRAY - openmp='-fopenmp' - ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -117,6 +113,10 @@ else esac fi ;; + *Cray*) + vendor=CRAY + openmp='-fopenmp' + ;; *g95*) vendor=G95 openmp='' From 79d842047ad1abd4f0d2ea9e4794564916db8041 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Sep 2022 11:58:15 +0200 Subject: [PATCH 032/154] Move Cray case 
after GNU as Cray builds of gfortran have both names in the version string --- f_check.pl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/f_check.pl b/f_check.pl index cfc7331c2..f093b9ad5 100644 --- a/f_check.pl +++ b/f_check.pl @@ -76,11 +76,6 @@ if ($compiler eq "") { $vendor = FUJITSU; $openmp = "-Kopenmp"; - } elsif ($data =~ /Cray/) { - - $vendor = CRAY; - $openmp = "-fopenmp"; - } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ s/\(+.*?\)+//g; @@ -106,6 +101,10 @@ if ($compiler eq "") { $openmp = ""; } } + } elsif ($data =~ /Cray/) { + + $vendor = CRAY; + $openmp = "-fopenmp"; } From db50ab4a720b45ce356a6f91b2c452c84e1e5a93 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Oct 2022 15:14:51 +0200 Subject: [PATCH 033/154] Add BUILD_vartype defines --- driver/others/blas_server_win32.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 33b58f134..afa33cccc 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -278,12 +278,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } @@ -295,11 +298,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } From 57809526c430ef0a07f5c5c39dce20c6d73a1f35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Oct 2022 09:12:23 +0200 Subject: [PATCH 034/154] Disable the gfortran tree vectorizer for lapack-netlib --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 289f0eca5..56af9847e 100644 --- a/Makefile +++ b/Makefile @@ -278,7 +278,11 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(F_COMPILER), GFORTRAN) + -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc From 32566bfb44067e0c0459e94b53c9457613539eeb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Oct 2022 14:04:25 +0200 Subject: [PATCH 035/154] Disable the gfortran tree vectorizer for netlib LAPACK --- cmake/lapack.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index f8a27f5d4..3b221d420 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -999,6 +999,9 @@ 
endforeach () if (NOT C_LAPACK) set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + if (${F_COMPILER} STREQUAL "GFORTRAN") + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") + endif() else () set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") endif () From b2523471c9e6a398f0950952c376e136398a1cfe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 16:16:26 +0200 Subject: [PATCH 036/154] Add libsuffix support --- openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.pc.in b/openblas.pc.in index ff849807c..8ad6e8bee 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Libs.private: ${extralib} Cflags: -I${includedir} From 8bacea125426d6f8a01d604962217c4cd837f699 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 16:18:29 +0200 Subject: [PATCH 037/154] Pass libsuffix to openblas.pc and fix passing of INTERFACE64/USE64BITINT flag --- Makefile.install | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 28727de37..adef0b5f4 100644 --- a/Makefile.install +++ b/Makefile.install @@ -152,8 +152,9 @@ endif #Generating openblas.pc @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'libsuffix='$(SYMBOLSUFFIX) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" From 747ade5adf36d2267c4f669238471f4eb793e462 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 17:28:07 +0200 Subject: [PATCH 038/154] fix INTERFACE64/USE64BITINT reporting --- cmake/openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 0bd49f996..7e120af86 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -2,7 +2,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ -openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ 
DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ +openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ From 5f72415f10fb19bf7ab4283238c08eac537f267a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 20:29:24 +0200 Subject: [PATCH 039/154] Suffix the pkgconfig file itself in INTERFACE64 builds --- Makefile.install | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Makefile.install b/Makefile.install index adef0b5f4..87b5bc870 100644 --- a/Makefile.install +++ b/Makefile.install @@ -14,6 +14,11 @@ OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif +PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + ifeq ($(USE_OPENMP), 1) ifeq ($(C_COMPILER), PGI) PKG_EXTRALIB += -lomp @@ -150,14 +155,19 @@ endif endif #Generating openblas.pc - @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'libsuffix='$(SYMBOLSUFFIX) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif + PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + + @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" + @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" + @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" + @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" + @echo 'version='$(VERSION) >> "$(PKGFILE)" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" + @cat openblas.pc.in >> "$(PKGFILE)" #Generating OpenBLASConfig.cmake From 9959a60873fbddc9dea23f4c32cc035147d1f351 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 20 Oct 2022 13:28:20 -0400 Subject: [PATCH 040/154] Benchmarks: align malloc'ed buffers. 
Benchmarks should allocate with cacheline (often 64 bytes) alignment to avoid unreliable timings. This technique, storing the offset in the byte before the pointer, doesn't require C11's aligned_alloc for compatibility with older compilers. For example, Glibc's x86_64 malloc returns 16-byte aligned buffers, which is not sufficient for AVX/AVX2 (32-byte preferred) or AVX512 (64-byte). --- benchmark/bench.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/benchmark/bench.h b/benchmark/bench.h index c03d72bef..f23e487aa 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){ #endif +/* Benchmarks should allocate with cacheline (often 64 bytes) alignment + to avoid unreliable results. This technique, storing the offset in the + byte before the pointer, doesn't require C11's aligned_alloc for + compatibility with older compilers. */ +static void *aligned_alloc_cacheline(size_t n) +{ + void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); + if (p) { + void **newp = (void **) + (((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE); + newp[-1] = p; + p = newp; + } + return p; +} +#define malloc aligned_alloc_cacheline +#define free(p) free((p) ? ((void **)(p))[-1] : (p)) + #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; #elif defined(__APPLE__) From 9e6b060bf3d74dd9eac7325cb9e5cc262a5584a6 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 20 Oct 2022 20:11:09 -0400 Subject: [PATCH 041/154] Fix comment. It stores the pointer, not an offset (that would be an alternative approach). --- benchmark/bench.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/bench.h b/benchmark/bench.h index f23e487aa..1dae4d0fd 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -75,9 +75,9 @@ static void *huge_malloc(BLASLONG size){ #endif /* Benchmarks should allocate with cacheline (often 64 bytes) alignment - to avoid unreliable results. This technique, storing the offset in the - byte before the pointer, doesn't require C11's aligned_alloc for - compatibility with older compilers. */ + to avoid unreliable results. This technique, storing the allocated + pointer value just before the aligned memory, doesn't require + C11's aligned_alloc for compatibility with older compilers. */ static void *aligned_alloc_cacheline(size_t n) { void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); From b00d5b974637bd079c1d8cbbf5c406259aa5a804 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Wed, 19 Oct 2022 11:36:26 +0800 Subject: [PATCH 042/154] New sbgemm implementation for Neoverse N2 1. Use UZP instructions but not gather load and scatter store instructions to get lower latency. 2. Padding k to a power of 4. 
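Here "padding" means rounding k up to the next multiple of 4, the block of
k-elements consumed by one BFMMLA instruction. A minimal sketch of the rounding
used by the new kernel and the level3 drivers in this patch, where pad_k denotes
the padded length:

    BLASLONG pad_k = (k + 3) & ~3;   /* smallest multiple of 4 that is >= k */
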
--- common_param.h | 1 + driver/level3/level3.c | 16 +- driver/level3/level3_thread.c | 14 +- kernel/arm64/KERNEL.NEOVERSEN2 | 11 +- .../arm64/sbgemm_kernel_neoversen2_newbf16.c | 467 ++++++++++++++++++ kernel/arm64/sbgemm_ncopy_4_neoversen2.c | 137 +++++ kernel/arm64/sbgemm_tcopy_8_neoversen2.c | 174 +++++++ kernel/setparam-ref.c | 9 +- 8 files changed, 818 insertions(+), 11 deletions(-) create mode 100644 kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c create mode 100644 kernel/arm64/sbgemm_ncopy_4_neoversen2.c create mode 100644 kernel/arm64/sbgemm_tcopy_8_neoversen2.c diff --git a/common_param.h b/common_param.h index 31fba9059..091840343 100644 --- a/common_param.h +++ b/common_param.h @@ -1193,6 +1193,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); #ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif + int align_k; // must be 2^n } gotoblas_t; extern gotoblas_t *gotoblas; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 4a8e193be..d3281345d 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -304,6 +304,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; } + BLASLONG pad_min_l = min_l; + +#if defined(HALF) && defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#endif + +#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) + pad_min_l = (min_l + 3) & ~3; +#endif + /* First, we have to move data A to L2 cache */ min_i = m_to - m_from; l1stride = 1; @@ -350,7 +360,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - sb + min_l * (jjs - js) * COMPSIZE * l1stride); + sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(outercost); @@ -358,10 +368,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #else KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #endif STOP_RPCC(kernelcost); diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index dfc7107b8..95c8e6d19 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -324,6 +324,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } + + BLASLONG pad_min_l = min_l; + +#if defined(HALF) && defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#endif + +#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) + pad_min_l = (min_l + 3) & ~3; +#endif /* Determine step size in m * Note: We are currently on the first step in m @@ -382,13 +392,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Copy part of local region of B into workspace */ START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride); + buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(copy_B); /* 
Apply kernel with local region of A and part of local region of B */ START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride, + sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); STOP_RPCC(kernel); diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 07a94a043..7fe9acd5c 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -189,11 +189,12 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMM_BETA = sbgemm_beta_neoversen2.c -SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c -SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c -SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c +# SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMKERNEL = sbgemm_kernel_neoversen2_newbf16.c +SBGEMMINCOPY = sbgemm_ncopy_4_neoversen2.c +SBGEMMITCOPY = sbgemm_tcopy_8_neoversen2.c +SBGEMMONCOPY = sbgemm_ncopy_4_neoversen2.c +SBGEMMOTCOPY = sbgemm_tcopy_8_neoversen2.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c b/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c new file mode 100644 index 000000000..1bf743c7f --- /dev/null +++ b/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c @@ -0,0 +1,467 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include + +#include "common.h" + +#define LOAD_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + +#define LOAD_C_8x4 \ + do { \ + LOAD_C(0, 0); \ + LOAD_C(0, 1); \ + LOAD_C(1, 0); \ + LOAD_C(1, 1); \ + LOAD_C(2, 0); \ + LOAD_C(2, 1); \ + LOAD_C(3, 0); \ + LOAD_C(3, 1); \ + } while (0); + +#define STORE_C(PG, PTR, SRC, DST) \ + do { \ + SRC = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, DST, SRC); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, + FLOAT *C, BLASLONG ldc) { + BLASLONG pad_k = (k + 3) & ~3; + + svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, + oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16 = svptrue_b16(); + svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbool_t pg32 = svptrue_b32(); + svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; + bfloat16_t *ptr_b0, *ptr_b1; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + LOAD_C_8x4; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + +#if 0 + for (int q = 0; q < 8; q++) { + float tmp = 0; + *((bfloat16_t *)(&tmp) + 1) = ptr_b0[8+q]; + printf("%.1f ", tmp); + } + printf("\n"); +#endif + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + MATMUL(2, 0); MATMUL(2, 1); + MATMUL(3, 0); MATMUL(3, 1); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + vc4 = svuzp1(mc01, mc11); + vc5 = svuzp1(mc21, mc31); + vc6 = svuzp2(mc01, mc11); + vc7 = svuzp2(mc21, mc31); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c0+4, oc1, vc1); + STORE_C(pg32, ptr_c1, oc2, vc2); + STORE_C(pg32, ptr_c1+4, oc3, vc3); + STORE_C(pg32, ptr_c2, oc4, vc4) + STORE_C(pg32, ptr_c2+4, oc5, vc5); + STORE_C(pg32, ptr_c3, oc6, vc6) + STORE_C(pg32, ptr_c3+4, oc7, vc7); + + ptr_c0 += 8; + ptr_c1 += 8; + ptr_c2 += 8; + ptr_c3 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + vc2 = svuzp1(mc01, mc11); + vc3 = svuzp2(mc01, mc11); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c1, oc1, vc1); + STORE_C(pg32, ptr_c2, oc2, vc2); + 
STORE_C(pg32, ptr_c3, oc3, vc3); + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); LOAD_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + STORE_C(pg32_low, ptr_c0, oc0, vc0); + STORE_C(pg32_low, ptr_c1, oc1, vc1); + STORE_C(pg32_low, ptr_c2, oc2, vc2); + STORE_C(pg32_low, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); LOAD_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 4; + ptr_b0 += 16; + } + + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); + + STORE_C(pg32_first, ptr_c0, oc0, mc00); + STORE_C(pg32_first, ptr_c1, oc1, vc1); + STORE_C(pg32_first, ptr_c2, oc2, mc01); + STORE_C(pg32_first, ptr_c3, oc3, vc3); + + } + + ptr_b += 4 * pad_k; + } + + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + LOAD_C(1, 0); + LOAD_C(2, 0); + LOAD_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c0 + 4, oc1, vc1); + STORE_C(pg32, ptr_c1, oc2, vc2); + STORE_C(pg32, ptr_c1 + 4, oc3, vc3); + + ptr_c0 += 8; + ptr_c1 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + LOAD_C(1, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c1, oc1, vc1); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + STORE_C(pg32_low, ptr_c0, oc0, vc0); + STORE_C(pg32_low, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; + + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + LOAD_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 8; + } + vc1 = svuzp2(mc00, mc00); + + STORE_C(pg32_first, ptr_c0, oc0, mc00); + STORE_C(pg32_first, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + 
if (n & 1) { + ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + LOAD_C(1, 0); + LOAD_C(2, 0); + LOAD_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 4; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c0 + 4, oc1, vc1); + + ptr_c0 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + LOAD_C(0, 0); + LOAD_C(1, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc10); + STORE_C(pg32, ptr_c0, oc0, vc0); + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc00); + STORE_C(pg32_low, ptr_c0, oc0, vc0); + ptr_c0 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + LOAD_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 4; + } + STORE_C(pg32_first, ptr_c0, oc0, mc00); + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c new file mode 100644 index 000000000..0b0e7a427 --- /dev/null +++ b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c @@ -0,0 +1,137 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset; + IFLOAT *a_offsetx[4]; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbfloat16_t v0, v1, v2, v3; + + for (BLASLONG j = 0; j < n / 4; j++) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offset += 4 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + +#if 0 + for (int line = 0; line < 4; line++) { + for (int p = 0; p < 4; p++) { + float tmp = 0; + *((bfloat16 *)(&tmp) + 1) = b_offset[line * 4 + p]; + printf("%f ", tmp); + } + printf("\n"); + } +#endif + + b_offset += 16; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 4; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offset += 2 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + + b_offset += 8; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 2; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offsetx[0] = a_offset; + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + b_offset += 4; + a_offsetx[0] += 4; + } + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = a_offsetx[0][0]; + b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; + b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; + b_offset[3] = rest <= 3 ? 
0 : a_offsetx[0][3]; + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c new file mode 100644 index 000000000..6c37e4bcf --- /dev/null +++ b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c @@ -0,0 +1,174 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 8; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 8; line++) { +#if 0 + float fv0 = 0, fv1 = 0, fv2 = 0, fv3 = 0; + *((bfloat16 *)(&fv0) + 1) = a_offset0[line]; + *((bfloat16 *)(&fv1) + 1) = a_offset1[line]; + *((bfloat16 *)(&fv2) + 1) = a_offset2[line]; + *((bfloat16 *)(&fv3) + 1) = a_offset3[line]; + printf("%f %f %f %f\n", fv0, fv1, fv2, fv3); +#endif + + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + + b_offset += 32; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 8; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 
0 : a_offset3[line]; + } + b_offset += 32; + } + } + + if (n & 4) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 4; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + + b_offset += 16; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 2; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + b_offset += 8; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + b_offset[0] = *a_offset0; + b_offset[1] = *a_offset1; + b_offset[2] = *a_offset2; + b_offset[3] = *a_offset3; + b_offset += 4; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = *a_offset0; + b_offset[1] = rest == 1 ? 0 : *a_offset1; + b_offset[2] = rest <= 2 ? 0 : *a_offset2; + b_offset[3] = rest <= 3 ? 
0 : *a_offset3; + } + } + return 0; +} diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 8bcd31ef2..010c39bd4 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -866,8 +866,9 @@ gotoblas_t TABLE_NAME = { cgeadd_kTS, #endif #if BUILD_COMPLEX16==1 - zgeadd_kTS + zgeadd_kTS, #endif + 0, // padding_k }; #if (ARCH_ARM64) @@ -972,6 +973,12 @@ static void init_parameter(void) { TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; #endif #endif + +#if defined(NEOVERSEN2) && BUILD_BFLOAT16 == 1 + TABLE_NAME.align_k = 4; +#else + TABLE_NAME.align_k = 1; +#endif } #else // (ARCH_ARM64) From 843e9fd0b9fb428a9d715cdc6506e57395311a27 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Wed, 26 Oct 2022 17:06:06 +0800 Subject: [PATCH 043/154] Fix typo error --- kernel/setparam-ref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 010c39bd4..effcf8965 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -868,7 +868,7 @@ gotoblas_t TABLE_NAME = { #if BUILD_COMPLEX16==1 zgeadd_kTS, #endif - 0, // padding_k + 1, // align_k }; #if (ARCH_ARM64) From e7fd8d21a6f88b098a25ab76d3360efa1d38f830 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Oct 2022 15:33:58 +0200 Subject: [PATCH 044/154] Add GEMMT based on looped GEMV --- interface/CMakeLists.txt | 2 +- interface/Makefile | 57 +++- interface/gemmt.c | 589 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 637 insertions(+), 11 deletions(-) create mode 100644 interface/gemmt.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 0b2998237..654684b71 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c + trsm.c syrk.c syr2k.c gemmt.c ) set(BLAS3_MANGLED_SOURCES diff --git a/interface/Makefile b/interface/Makefile index abdac96e1..a1f4f66da 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -44,12 +44,12 @@ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) + sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -76,7 +76,7 @@ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) + dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -105,7 +105,7 @@ CBLAS3OBJS = \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) + cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -134,7 +134,7 @@ ZBLAS3OBJS = \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) + zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -281,12 +281,12 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ 
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -306,7 +306,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -331,7 +331,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -362,7 +362,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -1300,6 +1300,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h @@ -1320,6 +1322,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemmt.$(SUFFIX) xgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1907,6 +1927,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + +cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git 
a/interface/gemmt.c b/interface/gemmt.c new file mode 100644 index 000000000..3eed1dfe4 --- /dev/null +++ b/interface/gemmt.c @@ -0,0 +1,589 @@ +/*********************************************************************/ +/* Copyright 2022, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 +#ifdef XDOUBLE +#define ERROR_NAME "QGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMT " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMT " +#else +#define ERROR_NAME "SGEMT " +#endif +#else +#define SMP_THRESHOLD_MIN 8192.0 +#ifdef XDOUBLE +#define ERROR_NAME "XGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMT " +#else +#define ERROR_NAME "CGEMT " +#endif +#endif + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * N, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, n, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; +#if defined(COMPLEX) + FLOAT alpha_r, alpha_i, beta_r, beta_i; +#else + FLOAT alpha, beta; +#endif + + PRINT_DEBUG_NAME; + + m = *M; + n = *N; + k = *K; + +#if defined(COMPLEX) + FLOAT *alpha = Alpha; + alpha_r = *(Alpha + 0); + alpha_i = *(Alpha + 1); + + beta_r = *(Beta + 0); + beta_i = *(Beta + 1); +#else + alpha = *Alpha; + beta = *Beta; +#endif + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; +#ifndef COMPLEX + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; +#else 
+ if (transA == 'R') + transa = 2; + if (transA == 'C') + transa = 3; +#endif + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; +#ifndef COMPLEX + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; +#else + if (transB == 'R') + transb = 2; + if (transB == 'C') + transb = 3; +#endif + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + + info = 0; + + if (uplo < 0) + info = 14; + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + if (info) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, + blasint N, blasint k, +#ifndef COMPLEX + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ +#else + void *valpha, + void *va, blasint LDA, + void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc) +{ + FLOAT *alpha = (FLOAT *) valpha; + FLOAT *beta = (FLOAT *) vbeta; + FLOAT *A = (FLOAT *) va; + FLOAT *B = (FLOAT *) vb; + FLOAT *c = (FLOAT *) vc; +#endif + FLOAT *aa, *bb, *cc; + + int transa, transb, uplo; + blasint info; + blasint m, n, lda, ldb; + FLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; +#else + if (TransA == CblasConjNoTrans) + transa = 2; + if (TransA == CblasConjTrans) + transa = 3; +#endif + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; +#else + if (TransB == CblasConjNoTrans) + transb = 2; + if (TransB == CblasConjTrans) + transb = 3; +#endif + + m = M; + n = N; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + } + + if (order == CblasRowMajor) { + m = N; + n = M; + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; +#else + if (TransB == CblasConjNoTrans) + transa = 2; + if (TransB == CblasConjTrans) + transa = 3; +#endif + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; +#else + if (TransA == CblasConjNoTrans) + transb = 2; + if (TransA == CblasConjTrans) + transb = 3; +#endif + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + } + + uplo = -1; + if (Uplo == CblasUpper) + uplo = 0; + if (Uplo == CblasLower) + uplo = 1; + if (uplo < 0) + info = 14; + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#if defined(COMPLEX) + FLOAT alpha_r = *(alpha + 0); + FLOAT alpha_i = 
*(alpha + 1); + + FLOAT beta_r = *(beta + 0); + FLOAT beta_i = *(beta + 1); +#endif + +#endif + int buffer_size; + blasint l; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + +#if defined(COMPLEX) + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, + xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, + xgemv_thread_d, +#elif defined DOUBLE + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, + zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, + zgemv_thread_d, +#else + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, + cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, + cgemv_thread_d, +#endif + }; +#endif + + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, + FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,}; + +#else + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgemv_thread_n, qgemv_thread_t, +#elif defined DOUBLE + dgemv_thread_n, dgemv_thread_t, +#else + sgemv_thread_n, sgemv_thread_t, +#endif + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T,}; + +#endif + + if ((m == 0) || (n == 0)) + return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + const blasint incb = (transb == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < n; i++) { + j = n - i; + + l = j; +#if defined(COMPLEX) + aa = a + i * 2; + bb = b + i * ldb * 2; + if (transa) { + l = k; + aa = a + lda * i * 2; + bb = b + i * 2; + } + cc = c + i * 2 * ldc + i * 2; +#else + aa = a + i; + bb = b + i * ldb; + if (transa) { + l = k; + aa = a + lda * i; + bb = b + i; + } + cc = c + i * ldc + i; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); +#endif +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < n; i++) { + j = i + 1; + + l = j; +#if defined COMPLEX + bb = b + i * ldb * 2; + if (transa) { + l = k; + bb = b + i * 2; + } + cc = c + i * 2 * ldc; +#else + bb = b + i * ldb; + if (transa) { + l = k; + bb = b + i; + } + cc = c + i * ldc; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == 
ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); +#endif + +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + args.m * args.k + args.k * args.n + + args.m * args.n, 2 * args.m * args.n * args.k); + + IDEBUG_END; + + return; +} From 4989e039a5b37de140b41df9a042720599336e29 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Thu, 27 Oct 2022 14:10:26 +0800 Subject: [PATCH 045/154] Define SBGEMM_ALIGN_K for DYNAMIC_ARCH build --- common_param.h | 2 +- driver/level3/level3.c | 11 +++++------ driver/level3/level3_thread.c | 10 +++++----- kernel/setparam-ref.c | 8 ++------ param.h | 5 +++++ 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/common_param.h b/common_param.h index 091840343..e14ef2782 100644 --- a/common_param.h +++ b/common_param.h @@ -50,6 +50,7 @@ typedef struct { #ifdef BUILD_BFLOAT16 int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; + int sbgemm_align_k; void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); @@ -1193,7 +1194,6 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); #ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif - int align_k; // must be 2^n } gotoblas_t; extern gotoblas_t *gotoblas; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index d3281345d..b7328876b 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -305,13 +305,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } BLASLONG pad_min_l = min_l; - -#if defined(HALF) && defined(DYNAMIC_ARCH) - pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#if defined(HALF) +#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; #endif - -#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) - pad_min_l = (min_l + 3) & ~3; #endif /* First, we have to move data A to L2 cache */ diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 95c8e6d19..02b60b50d 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -327,12 +327,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG pad_min_l = min_l; -#if defined(HALF) && defined(DYNAMIC_ARCH) - pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#if defined(HALF) +#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + 
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; #endif - -#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) - pad_min_l = (min_l + 3) & ~3; #endif /* Determine step size in m diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index effcf8965..16d19af1b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -62,6 +62,8 @@ gotoblas_t TABLE_NAME = { MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), #endif + SBGEMM_ALIGN_K, + sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS, @@ -973,12 +975,6 @@ static void init_parameter(void) { TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; #endif #endif - -#if defined(NEOVERSEN2) && BUILD_BFLOAT16 == 1 - TABLE_NAME.align_k = 4; -#else - TABLE_NAME.align_k = 1; -#endif } #else // (ARCH_ARM64) diff --git a/param.h b/param.h index b9b9a55e8..514b13a3a 100644 --- a/param.h +++ b/param.h @@ -79,6 +79,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SBGEMM_DEFAULT_P 256 #define SBGEMM_DEFAULT_R 256 #define SBGEMM_DEFAULT_Q 256 +#define SBGEMM_ALIGN_K 1 // must be 2^x + #ifdef OPTERON #define SNUMOPT 4 @@ -3394,6 +3396,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) +#undef SBGEMM_ALIGN_K +#define SBGEMM_ALIGN_K 4 + #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N #define SBGEMM_DEFAULT_UNROLL_M 8 From e7e3aa29482281edba46a27fcd452d7ed630f46a Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 27 Oct 2022 17:20:44 -0400 Subject: [PATCH 046/154] x86_64: prevent GCC and Clang from generating FMAs in cscal/zscal. If e.g. -march=haswell is set in CFLAGS, GCC generates FMAs by default, which is inconsistent with the microkernels, none of which use FMAs. These inconsistencies cause a few failures in the LAPACK testcases, where eigenvalue results with/without eigenvectors are compared. Moreover using FMAs for multiplication of complex numbers can give surprising results, see 22aa81f for more information. This uses the same syntax as used in 22aa81f for zarch (s390x). --- kernel/x86_64/cscal.c | 13 +++++++++++++ kernel/x86_64/zscal.c | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index dc3f688c6..6ae66d973 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -25,6 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 3744c98bb..dfdb4230b 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -25,6 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" From 79066b6bf3c460caedd4ebbef4d17541fc0369bc Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Fri, 28 Oct 2022 17:09:39 +0800 Subject: [PATCH 047/154] Change file name to match the norm and delete useless code. --- kernel/arm64/KERNEL.NEOVERSEN2 | 11 +- kernel/arm64/sbgemm_kernel_8x4_neoversen2.c | 10 +- .../arm64/sbgemm_kernel_8x4_neoversen2_impl.c | 1004 +++++++---------- .../arm64/sbgemm_kernel_neoversen2_newbf16.c | 467 -------- kernel/arm64/sbgemm_ncopy_4_neoversen2.c | 11 - kernel/arm64/sbgemm_ncopy_neoversen2.c | 101 -- kernel/arm64/sbgemm_tcopy_8_neoversen2.c | 9 - kernel/arm64/sbgemm_tcopy_neoversen2.c | 109 -- 8 files changed, 415 insertions(+), 1307 deletions(-) delete mode 100644 kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c delete mode 100644 kernel/arm64/sbgemm_ncopy_neoversen2.c delete mode 100644 kernel/arm64/sbgemm_tcopy_neoversen2.c diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 7fe9acd5c..ae386d6e1 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -189,12 +189,11 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMM_BETA = sbgemm_beta_neoversen2.c -# SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c -SBGEMMKERNEL = sbgemm_kernel_neoversen2_newbf16.c -SBGEMMINCOPY = sbgemm_ncopy_4_neoversen2.c -SBGEMMITCOPY = sbgemm_tcopy_8_neoversen2.c -SBGEMMONCOPY = sbgemm_ncopy_4_neoversen2.c -SBGEMMOTCOPY = sbgemm_tcopy_8_neoversen2.c +SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c +SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c index 66e7dd38a..4c1385fbe 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c @@ -37,9 +37,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { - if (alpha == 1.0f) - return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); - else - return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); - return 0; + if (alpha == 1.0f) + return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); + return 0; } diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c index 7d53b1aa0..26ea7ee61 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c @@ -30,636 +30,442 @@ #include "common.h" +#define INIT_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = 
svbfmmla(mc##M##N, ma##M, mb##N); + +#define INIT_C_8x4 \ + do { \ + INIT_C(0, 0); \ + INIT_C(0, 1); \ + INIT_C(1, 0); \ + INIT_C(1, 1); \ + INIT_C(2, 0); \ + INIT_C(2, 1); \ + INIT_C(3, 0); \ + INIT_C(3, 1); \ + } while (0); + #ifdef ALPHA_ONE -#define LOAD_C(M, N) \ - mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svadd_z((PG), SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#else +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#endif -#define LOAD_C_LOW(M, N) \ - mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc); +#ifdef ALPHA_ONE +int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG pad_k = (k + 3) & ~3; + + svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, + oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16 = svptrue_b16(); + svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbool_t pg32 = svptrue_b32(); + svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; + bfloat16_t *ptr_b0, *ptr_b1; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C_8x4; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + MATMUL(2, 0); MATMUL(2, 1); + MATMUL(3, 0); MATMUL(3, 1); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + vc4 = svuzp1(mc01, mc11); + vc5 = svuzp1(mc21, mc31); + vc6 = svuzp2(mc01, mc11); + vc7 = svuzp2(mc21, mc31); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0+4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1+4, oc3, vc3); + UPDATE_C(pg32, ptr_c2, oc4, vc4) + UPDATE_C(pg32, ptr_c2+4, oc5, vc5); + UPDATE_C(pg32, ptr_c3, oc6, vc6) + UPDATE_C(pg32, ptr_c3+4, oc7, vc7); + + ptr_c0 += 8; + ptr_c1 += 8; + ptr_c2 += 8; + ptr_c3 += 8; + } -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + INIT_C(1, 0); INIT_C(1, 1); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); 
+ + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + vc2 = svuzp1(mc01, mc11); + vc3 = svuzp2(mc01, mc11); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + UPDATE_C(pg32, ptr_c2, oc2, vc2); + UPDATE_C(pg32, ptr_c3, oc3, vc3); + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc); + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + UPDATE_C(pg32_low, ptr_c2, oc2, vc2); + UPDATE_C(pg32_low, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } -#define STORE_C(M, N) \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; -#define STORE_C_LOW(M, N) \ - svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); -#define STORE_C_EVEN(M, N) \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + MATMUL(0, 0); MATMUL(0, 1); -#define STORE_C_FIRST(M, N) \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_a0 += 4; + ptr_b0 += 16; + } -#else -#define LOAD_C(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); -#define LOAD_C_LOW(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc); + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first, ptr_c2, oc2, mc01); + UPDATE_C(pg32_first, ptr_c3, oc3, vc3); -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc); + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc); + ptr_b += 4 * pad_k; + } -#define STORE_C(M, N) \ - mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; -#define STORE_C_LOW(M, N) \ - mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; -#define STORE_C_EVEN(M, N) \ - mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_b0 = ptr_b; -#define STORE_C_FIRST(M, N) \ - mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 
0); + INIT_C(3, 0); -#endif + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); -#define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M); + mb0 = svld1_bf16(pg16, ptr_b0); -#define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N); + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + ptr_a0 += 32; + ptr_b0 += 8; + } -#define LOAD_KREST_1(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \ - *(ptr_##NAME##M + 1), zero, zero, zero); + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); -#define LOAD_KREST_1_LOW(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \ - zero, zero); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3); -#define LOAD_KREST_2(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \ - *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero); + ptr_c0 += 8; + ptr_c1 += 8; + } -#define LOAD_KREST_2_LOW(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \ - zero, zero, zero, zero, zero); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; -#define LOAD_KREST_3(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \ - *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero); + INIT_C(0, 0); + INIT_C(1, 0); -#define LOAD_KREST_3_LOW(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 8; + } + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; -#ifdef ALPHA_ONE -int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#else -int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#endif -{ - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c00, *ptr_c01; - - svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31; -#ifndef ALPHA_ONE - svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31; -#endif - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = 
svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_even = svdupq_b32(1, 0, 1, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); - svfloat32_t svalpha = svdup_f32(alpha); - bfloat16 tmp = 0; - bfloat16_t zero = *((bfloat16_t *)&tmp); - BLASLONG krest = k & 3; - - // 00 01 10 11 - svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1); - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c00 = ptr_c; - ptr_c01 = ptr_c + 2 * ldc; - ptr_c += 4 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - LOAD_C(2, 0); LOAD_C(2, 1); - LOAD_C(3, 0); LOAD_C(3, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - STORE_C(2, 0); STORE_C(2, 1); - STORE_C(3, 0); STORE_C(3, 1); - - ptr_c00 += 8; ptr_c01 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - - ptr_c00 += 4; ptr_c01 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C(0, 0); STORE_C(0, 1); - ptr_c00 += 2; ptr_c01 += 2; - } - - if (m & 1) { - 
ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 4; - ptr_b0 += 8; - ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C_LOW(0, 0); STORE_C_LOW(0, 1); - } - - ptr_b += 4 * k; } - if (n & 2) { - ptr_c00 = ptr_c; - ptr_c += 2 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - - STORE_C(0, 0); - STORE_C(1, 0); - STORE_C(2, 0); - STORE_C(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C(0, 0) - STORE_C(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C(0, 0); - ptr_c00 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C_LOW(0, 0); - } - - ptr_b += 2 * k; + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 
= ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 8; + } + vc1 = svuzp2(mc00, mc00); + + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + if (n & 1) { + ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 4; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + + ptr_c0 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + INIT_C(0, 0); + INIT_C(1, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc10); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + ptr_c0 += 2; } - if (n & 1) { - ptr_c00 = ptr_c; - ptr_a = (bfloat16_t *) A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - LOAD_C_EVEN(2, 0); - LOAD_C_EVEN(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0); - STORE_C_EVEN(2, 0); - STORE_C_EVEN(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2_LOW(b, 0); - 
} else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_EVEN(0, 0); - ptr_c00 += 2; - } - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C_FIRST(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 4; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_FIRST(0, 0); - } + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 4; + } + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); } + } - return 0; -} \ No newline at end of file + return 0; +} diff --git a/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c b/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c deleted file mode 100644 index 1bf743c7f..000000000 --- a/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c +++ /dev/null @@ -1,467 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include - -#include "common.h" - -#define LOAD_C(M, N) mc##M##N = svdup_f32(0); - -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); - -#define LOAD_C_8x4 \ - do { \ - LOAD_C(0, 0); \ - LOAD_C(0, 1); \ - LOAD_C(1, 0); \ - LOAD_C(1, 1); \ - LOAD_C(2, 0); \ - LOAD_C(2, 1); \ - LOAD_C(3, 0); \ - LOAD_C(3, 1); \ - } while (0); - -#define STORE_C(PG, PTR, SRC, DST) \ - do { \ - SRC = svld1_f32((PG), (PTR)); \ - DST = svmad_z((PG), svalpha, DST, SRC); \ - svst1_f32((PG), (PTR), DST); \ - } while (0); - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, - FLOAT *C, BLASLONG ldc) { - BLASLONG pad_k = (k + 3) & ~3; - - svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, - vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, - oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; - svfloat32_t svalpha = svdup_f32(alpha); - - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); - - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c2 = ptr_c1 + ldc; - ptr_c3 = ptr_c2 + ldc; - ptr_c += 4 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - LOAD_C_8x4; - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - -#if 0 - for (int q = 0; q < 8; q++) { - float tmp = 0; - *((bfloat16_t *)(&tmp) + 1) = ptr_b0[8+q]; - printf("%.1f ", tmp); - } - printf("\n"); -#endif - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - - ptr_a0 += 32; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - vc2 = svuzp2(mc00, mc10); - vc3 = svuzp2(mc20, mc30); - vc4 = svuzp1(mc01, mc11); - vc5 = svuzp1(mc21, mc31); - vc6 = svuzp2(mc01, mc11); - vc7 = svuzp2(mc21, mc31); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c0+4, oc1, vc1); - STORE_C(pg32, ptr_c1, oc2, vc2); - STORE_C(pg32, ptr_c1+4, oc3, vc3); - STORE_C(pg32, ptr_c2, oc4, vc4) - STORE_C(pg32, ptr_c2+4, oc5, vc5); - STORE_C(pg32, ptr_c3, oc6, vc6) - STORE_C(pg32, ptr_c3+4, oc7, vc7); - - ptr_c0 += 8; - ptr_c1 += 8; - ptr_c2 += 8; - ptr_c3 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - - ptr_a0 += 16; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp2(mc00, mc10); - vc2 = svuzp1(mc01, mc11); - vc3 = svuzp2(mc01, mc11); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c1, oc1, vc1); - STORE_C(pg32, ptr_c2, oc2, vc2); - 
STORE_C(pg32, ptr_c3, oc3, vc3); - - ptr_c0 += 4; - ptr_c1 += 4; - ptr_c2 += 4; - ptr_c3 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); LOAD_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 8; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - vc2 = svuzp1(mc01, mc01); - vc3 = svuzp2(mc01, mc01); - - STORE_C(pg32_low, ptr_c0, oc0, vc0); - STORE_C(pg32_low, ptr_c1, oc1, vc1); - STORE_C(pg32_low, ptr_c2, oc2, vc2); - STORE_C(pg32_low, ptr_c3, oc3, vc3); - - ptr_c0 += 2; - ptr_c1 += 2; - ptr_c2 += 2; - ptr_c3 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); LOAD_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 4; - ptr_b0 += 16; - } - - vc1 = svuzp2(mc00, mc00); - vc3 = svuzp2(mc01, mc01); - - STORE_C(pg32_first, ptr_c0, oc0, mc00); - STORE_C(pg32_first, ptr_c1, oc1, vc1); - STORE_C(pg32_first, ptr_c2, oc2, mc01); - STORE_C(pg32_first, ptr_c3, oc3, vc3); - - } - - ptr_b += 4 * pad_k; - } - - if (n & 2) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c += 2 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 32; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - vc2 = svuzp2(mc00, mc10); - vc3 = svuzp2(mc20, mc30); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c0 + 4, oc1, vc1); - STORE_C(pg32, ptr_c1, oc2, vc2); - STORE_C(pg32, ptr_c1 + 4, oc3, vc3); - - ptr_c0 += 8; - ptr_c1 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); - MATMUL(0, 0); - MATMUL(1, 0); - ptr_a0 += 16; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp2(mc00, mc10); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c1, oc1, vc1); - - ptr_c0 += 4; - ptr_c1 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - STORE_C(pg32_low, ptr_c0, oc0, vc0); - STORE_C(pg32_low, ptr_c1, oc1, vc1); - - ptr_c0 += 2; - ptr_c1 += 2; - - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 8; - } - vc1 = svuzp2(mc00, mc00); - - STORE_C(pg32_first, ptr_c0, oc0, mc00); - STORE_C(pg32_first, ptr_c1, oc1, vc1); - } - - ptr_b += 2 * pad_k; - } - - 
if (n & 1) { - ptr_c0 = ptr_c; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 32; - ptr_b0 += 4; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c0 + 4, oc1, vc1); - - ptr_c0 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - LOAD_C(0, 0); - LOAD_C(1, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16_low, ptr_b0); - MATMUL(0, 0); - MATMUL(1, 0); - ptr_a0 += 16; - ptr_b0 += 4; - } - vc0 = svuzp1(mc00, mc10); - STORE_C(pg32, ptr_c0, oc0, vc0); - ptr_c0 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 4; - } - vc0 = svuzp1(mc00, mc00); - STORE_C(pg32_low, ptr_c0, oc0, vc0); - ptr_c0 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 4; - } - STORE_C(pg32_first, ptr_c0, oc0, mc00); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c index 0b0e7a427..22978a388 100644 --- a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c +++ b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c @@ -58,17 +58,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); -#if 0 - for (int line = 0; line < 4; line++) { - for (int p = 0; p < 4; p++) { - float tmp = 0; - *((bfloat16 *)(&tmp) + 1) = b_offset[line * 4 + p]; - printf("%f ", tmp); - } - printf("\n"); - } -#endif - b_offset += 16; a_offsetx[0] += 4; a_offsetx[1] += 4; diff --git a/kernel/arm64/sbgemm_ncopy_neoversen2.c b/kernel/arm64/sbgemm_ncopy_neoversen2.c deleted file mode 100644 index 594067ebb..000000000 --- a/kernel/arm64/sbgemm_ncopy_neoversen2.c +++ /dev/null @@ -1,101 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset1 + 3); - *(b_offset + 4) = *(a_offset2 + 0); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset2 + 2); - *(b_offset + 7) = *(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - b_offset += 8; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset2 + 0); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset2 + 2); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset2 + 0); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - b_offset += 2; - } - } - if (n & 1) { - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - *(b_offset + 3) = *(a_offset + 3); - - b_offset += 4; - a_offset += 4; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c index 6c37e4bcf..a058b5a8e 100644 --- a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c +++ b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c @@ -43,15 +43,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { for (BLASLONG i = 0; i < m / 4; i++) { for (BLASLONG line = 0; line < 8; line++) { -#if 0 - float fv0 = 0, fv1 = 0, fv2 = 0, fv3 = 0; - *((bfloat16 *)(&fv0) + 1) = a_offset0[line]; - *((bfloat16 *)(&fv1) + 1) = a_offset1[line]; - *((bfloat16 *)(&fv2) + 1) = a_offset2[line]; - *((bfloat16 *)(&fv3) + 1) = a_offset3[line]; - printf("%f %f %f %f\n", fv0, fv1, fv2, fv3); -#endif - b_offset[line * 4] = a_offset0[line]; b_offset[line * 4 + 
1] = a_offset1[line]; b_offset[line * 4 + 2] = a_offset2[line]; diff --git a/kernel/arm64/sbgemm_tcopy_neoversen2.c b/kernel/arm64/sbgemm_tcopy_neoversen2.c deleted file mode 100644 index 2f3313379..000000000 --- a/kernel/arm64/sbgemm_tcopy_neoversen2.c +++ /dev/null @@ -1,109 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include "common.h" - - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - IFLOAT *b_offset; - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 2; - - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset4 + 0); - *(b_offset + 4) = *(a_offset1 + 1); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset3 + 1); - *(b_offset + 7) = *(a_offset4 + 1); - - b_offset += 8; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - a_offset4 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset1 + 1); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset3 + 1); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset1 + 1); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - b_offset += 2; - } - } - } - if (n & 1) { - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - *(b_offset + 3) = *(a_offset + lda * 3); - - b_offset += 4; - a_offset += 4 * lda; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset); - } - } - - return 0; -} From e27ad3a6cc248a3a21da9f3cbc6855c5b48cff04 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 28 Oct 2022 09:10:40 -0400 Subject: [PATCH 048/154] add raptor lake ids --- cpuid_x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4ac1de047..357376f42 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1544,6 +1544,13 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 11: //family 6 exmodel 11 + switch (model) { + case 7: // Raptor Lake + if(support_avx2()) + return CPUTYPE_HASWELL; + } + break; } break; case 0x7: @@ -2334,6 +2341,12 @@ int get_coretype(void){ return CORE_NEHALEM; } + case 11: + switch (model) { + case 7: // Raptor Lake + if(support_avx2()) + return CORE_HASWELL; + } case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; From 06b022b139c82c07a00f7b76e46c31b49b2cd728 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:42:36 +0100 Subject: [PATCH 049/154] Fix ReLAPACK source selection --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c92356e7..e830589e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,10 +212,10 @@ if(NOT NO_LAPACKE) add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) list(APPEND TARGET_OBJS "$") endif() -if(BUILD_RELAPACK) - 
add_library(RELAPACK OBJECT ${RELA_SOURCES}) - list(APPEND TARGET_OBJS "$") -endif() +#if(BUILD_RELAPACK) +# add_library(RELAPACK OBJECT ${RELA_SOURCES}) +# list(APPEND TARGET_OBJS "$") +#endif() set(OpenBLAS_LIBS "") if(BUILD_STATIC_LIBS) add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) From eeebaf22948192c151c87903865de603e93f2874 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:45:54 +0100 Subject: [PATCH 050/154] move INCLUDE_ALL to (c)make options --- relapack/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relapack/config.h b/relapack/config.h index 9d6919463..914efcbf0 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -45,7 +45,7 @@ // The following macros specify which routines are included in the library under // LAPACK's symbol names: 1 included, 0 not included -#define INCLUDE_ALL 1 +// #define INCLUDE_ALL 1 #define INCLUDE_XLAUUM INCLUDE_ALL #define INCLUDE_SLAUUM INCLUDE_XLAUUM From a082d54035d1e32db2dea16c74013c6ae6dc056d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:47:01 +0100 Subject: [PATCH 051/154] Rename to avoid conflict with OpenBLAS' toplevel config.h --- relapack/{config.h => relapack_config.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename relapack/{config.h => relapack_config.h} (100%) diff --git a/relapack/config.h b/relapack/relapack_config.h similarity index 100% rename from relapack/config.h rename to relapack/relapack_config.h From 3ebf5d219d41f0613f08ba89e9998ae8333d6118 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:49:07 +0100 Subject: [PATCH 052/154] handle INCLUDE_ALL and optional function prefixes --- relapack/Makefile | 86 ++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/relapack/Makefile b/relapack/Makefile index ddf101bd1..056a0ee48 100644 --- a/relapack/Makefile +++ b/relapack/Makefile @@ -1,53 +1,61 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system - +ifeq ($(RELAPACK_REPLACE),0) +RELAPREFIX=RELAPACK_ +INCLALL=-DINCLUDE_ALL=0 +else +INCLALL=-DINCLUDE_ALL=1 +endif SRC = $(wildcard src/*.c) SRC1 = \ - src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ - src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ - src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ - src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c + slauum.c clauum.c dlauum.c zlauum.c \ + strtri.c dtrtri.c ctrtri.c ztrtri.c \ + spotrf.c dpotrf.c cpotrf.c zpotrf.c \ + sgetrf.c dgetrf.c cgetrf.c zgetrf.c SRC2 = \ - src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ - src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c + cgbtrf.c cpbtrf.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + csytrf.c dgemmt.c dsytrf_rook_rec2.c stgsyl.c zhetrf_rec2.c zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dtgsyl.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c ssygst.c zgbtrf.c zpbtrf.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c \ + ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zsytrf.c SRCX = \ - src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ - src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c - -OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) -OBJS2 = $(SRC2:%.c=%.o) + cgbtrf.c cpbtrf.c ctrtri.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c cpotrf.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + cgetrf.c csytrf.c dgemmt.c dsytrf_rook_rec2.c sgetrf.c stgsyl.c zhetrf_rec2.c 
zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dgetrf.c dtgsyl.c slauum.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dlauum.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c spotrf.c strtri.c zlauum.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c dpotrf.c dtrtri.c ssygst.c zgbtrf.c zpbtrf.c ztrtri.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c zpotrf.c \ + clauum.c ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zgetrf.c zsytrf.c + + +OBJS1 = $(SRC1:%.c=src/$(RELAPREFIX)%.$(SUFFIX)) +OBJS2 = $(SRC2:%.c=src/$(RELAPREFIX)%.o) OBJS = $(OBJS1) $(OBJS2) TEST_SUITS = \ - slauum dlauum clauum zlauum \ - spotrf dpotrf cpotrf zpotrf \ - spbtrf dpbtrf cpbtrf zpbtrf \ - ssygst dsygst chegst zhegst \ - ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ - sgetrf dgetrf cgetrf zgetrf \ - sgbtrf dgbtrf cgbtrf zgbtrf \ - strsyl dtrsyl ctrsyl ztrsyl \ - stgsyl dtgsyl ctgsyl ztgsyl \ sgemmt dgemmt cgemmt zgemmt + + # slauum dlauum clauum zlauum \ + # spotrf dpotrf cpotrf zpotrf \ + # spbtrf dpbtrf cpbtrf zpbtrf \ + # ssygst dsygst chegst zhegst \ + # ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ + # sgetrf dgetrf cgetrf zgetrf \ + # sgbtrf dgbtrf cgbtrf zgbtrf \ + # strsyl dtrsyl ctrsyl ztrsyl \ + # stgsyl dtgsyl ctgsyl ztgsyl \ + TESTS = $(TEST_SUITS:%=test/%.pass) # dummies TEST_EXES = $(TEST_SUITS:%=test/%.x) @@ -63,11 +71,11 @@ libs: $(OBJS) $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) $(RANLIB) $(TOPDIR)/$(LIBNAME) -%.$(SUFFIX): %.c config.h - $(CC) $(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.$(SUFFIX): src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. $(INCLALL) -c $< -o $@ -%.o: %.c config.h - $(CC) $(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.o: src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. 
$(INCLALL) -c $< -o $@ # ReLAPACK testing From ce7ea72de101707c1c0b8b4f9830e6dc7d25a44b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:50:51 +0100 Subject: [PATCH 053/154] Fix include paths --- relapack/src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 2d861f54b..78fb1431f 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,5 +1,6 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) set(RELAFILES clauum.c From d39978cd7ff702e2a9d3df439814a7a8a511deb0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:53:19 +0100 Subject: [PATCH 054/154] Fix includes --- relapack/src/ctrsyl_rec2.c | 2 +- relapack/src/relapack.h | 4 ++-- relapack/src/ztrsyl_rec2.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c index 556491c7a..674d73709 100644 --- a/relapack/src/ctrsyl_rec2.c +++ b/relapack/src/ctrsyl_rec2.c @@ -10,7 +10,7 @@ http://www.netlib.org/f2c/libf2c.zip */ -#include "../config.h" +#include "relapack_config.h" #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h index 38c5c30d0..44652a074 100644 --- a/relapack/src/relapack.h +++ b/relapack/src/relapack.h @@ -1,7 +1,7 @@ #ifndef RELAPACK_INT_H #define RELAPACK_INT_H #include -#include "../../config.h" +#include "config.h" #if defined(OS_WINDOWS) && defined(__64BIT__) typedef long long BLASLONG; typedef unsigned long long BLASULONG; @@ -9,7 +9,7 @@ typedef unsigned long long BLASULONG; typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#include "../config.h" +#include "relapack_config.h" #include "../inc/relapack.h" diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c index edc6ffc6b..d07a4e8de 100644 --- a/relapack/src/ztrsyl_rec2.c +++ b/relapack/src/ztrsyl_rec2.c @@ -10,7 +10,7 @@ http://www.netlib.org/f2c/libf2c.zip */ -#include "../config.h" +#include "relapack_config.h" #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES From ea6c5f3cf553a23f8e2e787307805e7874e1f9c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:55:23 +0100 Subject: [PATCH 055/154] Add option RELAPACK_REPLACE --- Makefile.rule | 5 ++++- Makefile.system | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a0ad90a68..9665d951a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -131,6 +131,9 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 +# Have RecursiveLAPACK actually replace standard LAPACK routines instead of +# just adding its equivalents with a RELAPACK_ prefix +# RELAPACK_REPLACE = 1 # If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -207,7 +210,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute diff --git a/Makefile.system b/Makefile.system index 10b952d4b..3c29ab3f3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,10 @@ ifndef TOPDIR TOPDIR = . 
endif +ifndef RELAPACK_REPLACE +RELAPACK_REPLACE=0 +endif + # we need to use the host system's architecture for getarch compile options even especially when cross-compiling HOSTARCH := $(shell uname -m) ifeq ($(HOSTARCH), amd64) From c9d78dc3b2d938a27a372fdad3b376397bf52da3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Oct 2022 16:57:03 +0100 Subject: [PATCH 056/154] Remove excess initializer (leftover from rework of PR 3793) --- kernel/setparam-ref.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 16d19af1b..522c6d7d9 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -870,7 +870,6 @@ gotoblas_t TABLE_NAME = { #if BUILD_COMPLEX16==1 zgeadd_kTS, #endif - 1, // align_k }; #if (ARCH_ARM64) From c970717157ff601a3e53e7b2f60ae2ec467799c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Nov 2022 13:51:20 +0100 Subject: [PATCH 057/154] fix missing t in xgemmt rule Co-authored-by: Alexis <35051714+amontoison@users.noreply.github.com> --- interface/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Makefile b/interface/Makefile index a1f4f66da..6f320d8f7 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1337,7 +1337,7 @@ cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -xgemmt.$(SUFFIX) xgemm.$(PSUFFIX) : gemmt.c ../param.h +xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c From da6e426b13d4c7ad7ac066dd462c25f66c564322 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Nov 2022 18:13:35 +0100 Subject: [PATCH 058/154] fix Cooperlake not selectable via environment variable --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9a693b06f..f61930983 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -1018,7 +1018,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 24; i++) + for ( i=1 ; i <= 25; i++) { if (!strncasecmp(coretype,corename[i],20)) { From fcda11c1ae0c50d5ab393352d8b78084a4e1dcad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Nov 2022 23:48:50 +0100 Subject: [PATCH 059/154] Revert special handling of GEMMT --- relapack/relapack_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relapack/relapack_config.h b/relapack/relapack_config.h index 914efcbf0..ba428a61b 100644 --- a/relapack/relapack_config.h +++ b/relapack/relapack_config.h @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 1 +#define INCLUDE_XGEMMT INCLUDE_ALL #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT From 1b777641825f9f97f2fb0a3386d32e1d106c36db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 12:02:59 +0100 Subject: [PATCH 060/154] Conditionally leave out bits of LAPACK to be overridden by ReLAPACK --- interface/CMakeLists.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 0b2998237..4e082928b 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources 
set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c + trsm.c syrk.c syr2k.c gemmt.c ) set(BLAS3_MANGLED_SOURCES @@ -189,7 +189,16 @@ if (NOT DEFINED NO_LAPACK) ) GenerateNamedObjects("${LAPACK_SOURCES}") + if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) + else () + GenerateNamedObjects("lapack/getrs.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/getf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/potf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/laswp.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) + endif() endif () if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) From aa2a2d9c01357befb2d168d6833332b3dc50f008 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 12:04:46 +0100 Subject: [PATCH 061/154] Conditionally compile files that may get replaced by ReLAPACK --- lapack/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index fd4e57048..1d44e9490 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -39,8 +39,12 @@ set(UNIT_SOURCES2 trti2/trti2_L.c ) +if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_SOURCES}") GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +else() +GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +endif() GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) @@ -113,4 +117,3 @@ GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) add_library(lapack OBJECT ${OPENBLAS_SRC}) - From 2e64722681cd94ec3f7c077ee3f96c5350ddc352 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 16:20:17 +0100 Subject: [PATCH 062/154] Update Makefile.rule --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 9665d951a..5e6cefc22 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -210,7 +210,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). 
# CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute From e6204d254f1ef1ca8524f7d82ceaf31cbe63c17b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 16:21:11 +0100 Subject: [PATCH 063/154] Update CMakeLists.txt --- interface/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..ce1434a90 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c gemmt.c + trsm.c syrk.c syr2k.c ) set(BLAS3_MANGLED_SOURCES From 1865b152403661a99fb4b99f2c94ad0d88629651 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 10:31:30 +0100 Subject: [PATCH 064/154] Add fallbacks to RaptorLake entry --- cpuid_x86.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 357376f42..4afa931f0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1549,6 +1549,10 @@ int get_cpuname(void){ case 7: // Raptor Lake if(support_avx2()) return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -2344,8 +2348,14 @@ int get_coretype(void){ case 11: switch (model) { case 7: // Raptor Lake - if(support_avx2()) +#ifndef NO_AVX2 + if(support_avx2()) return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; } case 15: if (model <= 0x2) return CORE_NORTHWOOD; From c957ad684ed6b8ca64f332221b376f2ad0fdc51a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 10:46:43 +0100 Subject: [PATCH 065/154] Bump gcc requirement for NeoverseN2 and V1 to 10.4 --- Makefile.arm64 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 480684422..e2c471c2b 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -89,11 +89,11 @@ endif endif # Use a72 tunings because Neoverse-V1 is only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 @@ -119,11 +119,11 @@ endif endif # Use a72 tunings because Neoverse-N2 is only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else From be546ec1ad283e8543a9a2ff181a019b6a753d26 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 11:00:41 +0100 Subject: [PATCH 066/154] Add gcc options for Neoverse cpus --- cmake/cc.cmake | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 57e42781d..62278c4a7 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -155,6 +155,39 @@ if (${CORE} STREQUAL A64FX) endif () endif () +if (${CORE} STREQUAL 
NEOVERSEN2) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL NEOVERSEV1) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL NEOVERSEN1) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") From 6c1043eb41caaa1bda2f81a377316d191fd9947a Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Wed, 9 Nov 2022 08:28:23 -0500 Subject: [PATCH 067/154] Add [cz]scal microkernels for SKYLAKEX These are as similar to dscal_microk_skylakex-2.c as possible for consistency. Note that before this change SKYLAKEX+ uses generic C functions for cscal/zscal via commit 2271c350 from #2610 (which is masked by commit 086d87a30). However now #3799 disables FMAs (in turn enabled by `-march=skylake-avx512`) in the plain C code which fixes excessive LAPACK test failures more nicely. --- kernel/x86_64/KERNEL.SKYLAKEX | 3 - kernel/x86_64/cscal.c | 4 +- kernel/x86_64/cscal_microk_skylakex-2.c | 152 ++++++++++++++++++++++++ kernel/x86_64/zscal.c | 4 +- kernel/x86_64/zscal_microk_skylakex-2.c | 152 ++++++++++++++++++++++++ 5 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/cscal_microk_skylakex-2.c create mode 100644 kernel/x86_64/zscal_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index cb6f62981..548e5dcfc 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -44,8 +44,5 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c -CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c - CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 6ae66d973..95a99b8b9 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -41,7 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "cscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c new file mode 100644 index 000000000..8a622427b --- /dev/null +++ b/kernel/x86_64/cscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_ps does not exist so we flip signs for odd elements of da_i */ + __m512 da_r = _mm512_set1_ps(alpha[0]); + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 x0 = _mm512_loadu_ps(&x[i + 0]); + __m512 x1 = _mm512_loadu_ps(&x[i + 16]); + __m512 y0 = _mm512_permute_ps(x0, 0xb1); + __m512 y1 = _mm512_permute_ps(x1, 0xb1); + _mm512_storeu_ps(&x[i + 0], _mm512_add_ps(da_r * x0, da_i * y0)); + _mm512_storeu_ps(&x[i + 16], _mm512_add_ps(da_r * x1, da_i * y1)); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + __m256 da_i = _mm256_set1_ps(alpha[1]); + for (; i < n2; i += 32) { + __m256 x0 = _mm256_loadu_ps(&x[i + 0]); + __m256 x1 = _mm256_loadu_ps(&x[i + 8]); + __m256 x2 = _mm256_loadu_ps(&x[i + 16]); + __m256 x3 = _mm256_loadu_ps(&x[i + 24]); + __m256 y0 = _mm256_permute_ps(x0, 0xb1); + __m256 y1 = _mm256_permute_ps(x1, 0xb1); + __m256 y2 = _mm256_permute_ps(x2, 0xb1); + __m256 y3 = _mm256_permute_ps(x3, 0xb1); + _mm256_storeu_ps(&x[i + 0], _mm256_addsub_ps(da_r * x0, da_i * y0)); + _mm256_storeu_ps(&x[i + 8], _mm256_addsub_ps(da_r * x1, da_i * y1)); + _mm256_storeu_ps(&x[i + 16], _mm256_addsub_ps(da_r * x2, da_i * y2)); + _mm256_storeu_ps(&x[i + 24], _mm256_addsub_ps(da_r * x3, da_i * y3)); + } +#endif +} + + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 y0 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 0]), 0xb1); + __m512 y1 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 16]), 0xb1); + _mm512_storeu_ps(&x[i + 0], da_i * y0); + _mm512_storeu_ps(&x[i + 16], da_i * y1); + } +#else + __m256 da_i = _mm256_set1_ps(alpha[1]) * _mm256_set_ps(1, -1, 1, -1, 1, -1, 1, -1); + for (; i < n2; i += 32) { + __m256 y0 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 0]), 0xb1); + __m256 y1 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 8]), 0xb1); + __m256 y2 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 16]), 0xb1); + __m256 y3 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 24]), 0xb1); + _mm256_storeu_ps(&x[i + 0], da_i * y0); + _mm256_storeu_ps(&x[i + 8], da_i * y1); + _mm256_storeu_ps(&x[i + 16], da_i * y2); + _mm256_storeu_ps(&x[i + 24], da_i * y3); + } +#endif +} + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_r = _mm512_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i + 0], da_r * _mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&x[i + 16], da_r * _mm512_loadu_ps(&x[i + 16])); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], da_r * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&x[i + 8], da_r * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&x[i + 16], da_r * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&x[i + 24], da_r * _mm256_loadu_ps(&x[i + 24])); + } +#endif +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + 
BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512 zero = _mm512_setzero_ps(); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i], zero); + _mm512_storeu_ps(&x[i + 16], zero); + } +#else + __m256 zero = _mm256_setzero_ps(); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], zero); + _mm256_storeu_ps(&x[i + 8], zero); + _mm256_storeu_ps(&x[i + 16], zero); + _mm256_storeu_ps(&x[i + 24], zero); + } +#endif + +} + +#else +#include "cscal_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index dfdb4230b..45e3531b8 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -41,7 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "zscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c new file mode 100644 index 000000000..f9e05e333 --- /dev/null +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_pd does not exist so we flip signs for odd elements of da_i */ + __m512d da_r = _mm512_set1_pd(alpha[0]); + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d x0 = _mm512_loadu_pd(&x[i + 0]); + __m512d x1 = _mm512_loadu_pd(&x[i + 8]); + __m512d y0 = _mm512_permute_pd(x0, 0x55); + __m512d y1 = _mm512_permute_pd(x1, 0x55); + _mm512_storeu_pd(&x[i + 0], _mm512_add_pd(da_r * x0, da_i * y0)); + _mm512_storeu_pd(&x[i + 8], _mm512_add_pd(da_r * x1, da_i * y1)); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + __m256d da_i = _mm256_set1_pd(alpha[1]); + for (; i < n2; i += 16) { + __m256d x0 = _mm256_loadu_pd(&x[i + 0]); + __m256d x1 = _mm256_loadu_pd(&x[i + 4]); + __m256d x2 = _mm256_loadu_pd(&x[i + 8]); + __m256d x3 = _mm256_loadu_pd(&x[i + 12]); + __m256d y0 = _mm256_permute_pd(x0, 0x05); + __m256d y1 = _mm256_permute_pd(x1, 0x05); + __m256d y2 = _mm256_permute_pd(x2, 0x05); + __m256d y3 = _mm256_permute_pd(x3, 0x05); + _mm256_storeu_pd(&x[i + 0], _mm256_addsub_pd(da_r * x0, da_i * y0)); + _mm256_storeu_pd(&x[i + 4], _mm256_addsub_pd(da_r * x1, da_i * y1)); + _mm256_storeu_pd(&x[i + 8], _mm256_addsub_pd(da_r * x2, da_i * y2)); + _mm256_storeu_pd(&x[i + 12], _mm256_addsub_pd(da_r * x3, da_i * y3)); + } +#endif +} + + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d y0 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 0]), 0x55); + __m512d y1 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 8]), 0x55); + _mm512_storeu_pd(&x[i + 0], da_i * y0); + _mm512_storeu_pd(&x[i + 8], da_i * y1); + } +#else + __m256d da_i = _mm256_set1_pd(alpha[1]) * _mm256_set_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m256d y0 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 0]), 0x05); + __m256d y1 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 8]), 0x05); + __m256d y2 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 16]), 0x05); + __m256d y3 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 24]), 0x05); + _mm256_storeu_pd(&x[i + 0], da_i * y0); + _mm256_storeu_pd(&x[i + 4], da_i * y1); + _mm256_storeu_pd(&x[i + 8], da_i * y2); + _mm256_storeu_pd(&x[i + 12], da_i * y3); + } +#endif +} + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_r = _mm512_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i + 0], da_r * _mm512_loadu_pd(&x[i + 0])); + _mm512_storeu_pd(&x[i + 8], da_r * _mm512_loadu_pd(&x[i + 8])); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], da_r * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], da_r * _mm256_loadu_pd(&x[i + 4])); + _mm256_storeu_pd(&x[i + 8], da_r * _mm256_loadu_pd(&x[i + 8])); + _mm256_storeu_pd(&x[i + 12], da_r * _mm256_loadu_pd(&x[i + 12])); + } +#endif +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + 
BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i], zero); + _mm512_storeu_pd(&x[i + 8], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + _mm256_storeu_pd(&x[i + 8], zero); + _mm256_storeu_pd(&x[i + 12], zero); + } +#endif + +} + +#else +#include "zscal_microk_haswell-2.c" +#endif From 09dd90ca09cd61d14afa8b2f63fa7c250154ff07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 15:35:57 +0100 Subject: [PATCH 068/154] Limit cpu models in OSX_dynarch_cmake --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2de6ec6ba..16b9da4f5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -204,7 +204,7 @@ jobs: - script: | mkdir build cd build - cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. cmake --build . ctest From d141cf341f4e2c80f47e76296439335ded59a356 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 20:31:30 +0100 Subject: [PATCH 069/154] Increase the wait time for ppc jobs again --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a4edad726..06db6a95c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,7 +30,7 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" script: - - travis_wait 40 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - travis_wait 50 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -104,7 +104,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 40 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -121,7 +121,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 40 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE From cb48c29b6f43f1da0a4bb24c9ec1f7add06996c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 12:49:59 +0100 Subject: [PATCH 070/154] Fix workspace calculation (Reference-LAPACK PR690) --- lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f | 17 ++++++++++++----- lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f | 17 ++++++++++++----- lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f | 17 ++++++++++++----- lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f | 17 ++++++++++++----- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 369ed1983..46eaf33b9 100644 --- 
a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index be5720f4f..55cab8b23 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index bff973214..d2ad13ced 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. 
LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 79e86b41b..623b88a8a 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * From 3e2d52c502fc85c29c25a2e49ab8a46d4b1bebc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 13:00:52 +0100 Subject: [PATCH 071/154] Fix workspace calculation in GEQRF/GERQF (Reference-LAPACK PR 638) --- lapack-netlib/SRC/sgeqrf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f index f47d8bf32..b24615f7a 100644 --- a/lapack-netlib/SRC/sgeqrf.f +++ b/lapack-netlib/SRC/sgeqrf.f @@ -204,7 +204,7 @@ END IF * * Quick return if possible -* +* IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN From 6dcf737c5d9c8404ec7a4fda5a4958bdf669d418 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 14:51:39 +0100 Subject: [PATCH 072/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- lapack-netlib/LAPACKE/include/lapacke_utils.h | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index f84604e8a..332a5024f 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -68,7 +68,7 @@ void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ lapack_logical LAPACKE_lsame( char ca, char cb ) #if defined __GNUC__ - __attribute__((const)) + __attribute__((const)) #endif ; @@ -128,6 +128,10 @@ void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_float *in, lapack_int ldin, lapack_complex_float *out, lapack_int ldout ); +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ); void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -178,6 +182,10 @@ void LAPACKE_dtp_trans( int 
matrix_layout, char uplo, char diag, void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const double *in, lapack_int ldin, double *out, lapack_int ldout ); +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ); void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -228,6 +236,10 @@ void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n, const float *in, lapack_int ldin, float *out, lapack_int ldout ); +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float *out, lapack_int ldout ); void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -284,6 +296,10 @@ void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_double *in, lapack_int ldin, lapack_complex_double *out, lapack_int ldout ); +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ); /* NaN checkers */ #define LAPACK_SISNAN( x ) ( x != x ) @@ -376,6 +392,10 @@ lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_float *a, lapack_int lda ); +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ); lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -440,6 +460,9 @@ lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const double *a, lapack_int lda ); +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ); lapack_logical LAPACKE_sgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -504,6 +527,9 @@ lapack_logical LAPACKE_str_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const float *a, lapack_int lda ); +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ); lapack_logical LAPACKE_zgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -574,6 +600,10 @@ lapack_logical LAPACKE_ztr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_double *a, lapack_int lda ); +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int lda ); #ifdef __cplusplus } From 23cfe58ee37a382a621cef75e12b6bc64e0d6a84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 14:55:45 +0100 Subject: [PATCH 073/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- lapack-netlib/LAPACKE/src/lapacke_clantr.c | 6 +- lapack-netlib/LAPACKE/src/lapacke_clarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_clarfb_work.c | 57 +++++----------- 
lapack-netlib/LAPACKE/src/lapacke_dlantr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_dlarfb_work.c | 57 +++++----------- lapack-netlib/LAPACKE/src/lapacke_slantr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_slarfb_work.c | 57 +++++----------- lapack-netlib/LAPACKE/src/lapacke_zlantr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_zlarfb_work.c | 57 +++++----------- 12 files changed, 150 insertions(+), 362 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr.c b/lapack-netlib/LAPACKE/src/lapacke_clantr.c index 88e765f2b..e00b6c578 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr.c @@ -33,8 +33,8 @@ #include "lapacke_utils.h" float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, - lapack_int m, lapack_int n, const lapack_complex_float* a, - lapack_int lda ) + lapack_int m, lapack_int n, const lapack_complex_float* a, + lapack_int lda ) { lapack_int info = 0; float res = 0.; @@ -46,7 +46,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ctz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index ccd34cecd..8b1492bec 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_clarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); + return -8; + } + if( LAPACKE_ctz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_cge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c index 3ad97c22d..90ff0851f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_float *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_float*) LAPACKE_malloc( sizeof(lapack_complex_float) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ctz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_cge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c index 4d1be93d7..b20af0eb4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c @@ -46,7 +46,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_dtz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index 3c3c24c54..82e8fae52 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - 
lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); + return -8; + } + if( LAPACKE_dtz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_dge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return -8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return -8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c index 57c53bae3..1a68bf762 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? 
k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (double*) LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,ncols_v) ); @@ -98,36 +103,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_dtz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_dge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr.c b/lapack-netlib/LAPACKE/src/lapacke_slantr.c index 2f4c65889..e2f67cfd6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr.c @@ -46,7 +46,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_str_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_stz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index 37d51dee5..892648f4b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int 
ldwork; float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_slarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); + return -8; + } + if( LAPACKE_stz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_sge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return -8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return -8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c index 2f5d61676..d805a947a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; float *v_t = NULL, *t_t = 
NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,ncols_v) ); if( v_t == NULL ) { @@ -97,36 +102,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_stz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_sge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c index f6656d84d..4c078b9b0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c @@ -46,7 +46,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 
diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ztz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 7cd23dde8..25cedb506 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); + return -8; + } + if( LAPACKE_ztz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_zge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c index 1b4f892a1..64eb05263 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_double*) LAPACKE_malloc( sizeof(lapack_complex_double) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ztz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_zge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ From eba1112e38f774e7eca7300fe37cfc6a5f36f009 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 15:03:39 +0100 Subject: [PATCH 074/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- lapack-netlib/LAPACKE/utils/CMakeLists.txt | 81 +++++----- lapack-netlib/LAPACKE/utils/Makefile | 8 + .../LAPACKE/utils/lapacke_ctz_nancheck.c | 144 +++++++++++++++++ .../LAPACKE/utils/lapacke_ctz_trans.c | 153 ++++++++++++++++++ .../LAPACKE/utils/lapacke_dtz_nancheck.c | 143 ++++++++++++++++ .../LAPACKE/utils/lapacke_dtz_trans.c | 153 ++++++++++++++++++ .../LAPACKE/utils/lapacke_stz_nancheck.c | 143 ++++++++++++++++ .../LAPACKE/utils/lapacke_stz_trans.c | 153 ++++++++++++++++++ .../LAPACKE/utils/lapacke_ztz_nancheck.c | 144 +++++++++++++++++ .../LAPACKE/utils/lapacke_ztz_trans.c | 153 ++++++++++++++++++ 10 files changed, 1238 insertions(+), 37 deletions(-) create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c diff 
--git a/lapack-netlib/LAPACKE/utils/CMakeLists.txt b/lapack-netlib/LAPACKE/utils/CMakeLists.txt index dd36ee33e..dfb9aa370 100644 --- a/lapack-netlib/LAPACKE/utils/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/utils/CMakeLists.txt @@ -1,39 +1,46 @@ set(UTILS -lapacke_c_nancheck.c lapacke_ctr_trans.c lapacke_make_complex_float.c lapacke_zgb_nancheck.c -lapacke_cgb_nancheck.c lapacke_d_nancheck.c lapacke_s_nancheck.c lapacke_zgb_trans.c -lapacke_cgb_trans.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zge_nancheck.c -lapacke_cge_nancheck.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zge_trans.c -lapacke_cge_trans.c lapacke_dge_nancheck.c lapacke_sge_nancheck.c lapacke_zgg_nancheck.c -lapacke_cgg_nancheck.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zgg_trans.c -lapacke_cgg_trans.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgt_nancheck.c -lapacke_cgt_nancheck.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zhb_nancheck.c -lapacke_chb_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zhb_trans.c -lapacke_chb_trans.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhe_nancheck.c -lapacke_che_nancheck.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhe_trans.c -lapacke_che_trans.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zhp_nancheck.c -lapacke_chp_nancheck.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zhp_trans.c -lapacke_chp_trans.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zhs_nancheck.c -lapacke_chs_nancheck.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zhs_trans.c -lapacke_chs_trans.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpb_nancheck.c -lapacke_cpb_nancheck.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpb_trans.c -lapacke_cpb_trans.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpf_nancheck.c -lapacke_cpf_nancheck.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpf_trans.c -lapacke_cpf_trans.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpo_nancheck.c -lapacke_cpo_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zpo_trans.c -lapacke_cpo_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zpp_nancheck.c -lapacke_cpp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zpp_trans.c -lapacke_cpp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zpt_nancheck.c -lapacke_cpt_nancheck.c lapacke_dst_nancheck.c lapacke_sst_nancheck.c lapacke_zsp_nancheck.c -lapacke_csp_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsp_trans.c -lapacke_csp_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zst_nancheck.c -lapacke_cst_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_zsy_nancheck.c -lapacke_csy_nancheck.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_zsy_trans.c -lapacke_csy_trans.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztb_nancheck.c -lapacke_ctb_nancheck.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztb_trans.c -lapacke_ctb_trans.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztf_nancheck.c -lapacke_ctf_nancheck.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztf_trans.c -lapacke_ctf_trans.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztp_nancheck.c -lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c -lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c -lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c +lapacke_c_nancheck.c lapacke_d_nancheck.c 
lapacke_s_nancheck.c lapacke_z_nancheck.c +lapacke_cgb_nancheck.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zgb_trans.c +lapacke_cgb_trans.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zgb_nancheck.c +lapacke_cge_nancheck.c lapacke_dge_nancheck.c lapacke_sge_nancheck.c lapacke_zge_nancheck.c +lapacke_cge_trans.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zge_trans.c +lapacke_cgg_nancheck.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgg_nancheck.c +lapacke_cgg_trans.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zgg_trans.c +lapacke_cgt_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zgt_nancheck.c +lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zhb_nancheck.c +lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zhb_trans.c +lapacke_che_nancheck.c lapacke_zhe_nancheck.c +lapacke_che_trans.c lapacke_zhe_trans.c +lapacke_chp_nancheck.c lapacke_zhp_nancheck.c +lapacke_chp_trans.c lapacke_zhp_trans.c +lapacke_chs_nancheck.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhs_nancheck.c +lapacke_chs_trans.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhs_trans.c +lapacke_cpb_nancheck.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zpb_nancheck.c +lapacke_cpb_trans.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zpb_trans.c +lapacke_cpf_nancheck.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zpf_nancheck.c +lapacke_cpf_trans.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zpf_trans.c +lapacke_cpo_nancheck.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpo_nancheck.c +lapacke_cpo_trans.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpo_trans.c +lapacke_cpp_nancheck.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpp_nancheck.c +lapacke_cpp_trans.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpp_trans.c +lapacke_cpt_nancheck.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpt_nancheck.c +lapacke_csp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zsp_nancheck.c +lapacke_csp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zsp_trans.c +lapacke_cst_nancheck.c lapacke_dst_nancheck.c lapacke_sst_nancheck.c lapacke_zst_nancheck.c +lapacke_csy_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsy_nancheck.c +lapacke_csy_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zsy_trans.c +lapacke_ctb_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_ztb_nancheck.c +lapacke_ctb_trans.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_ztb_trans.c +lapacke_ctf_nancheck.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztf_nancheck.c +lapacke_ctf_trans.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztf_trans.c +lapacke_ctp_nancheck.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztp_nancheck.c +lapacke_ctp_trans.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztp_trans.c +lapacke_ctr_nancheck.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztr_nancheck.c +lapacke_ctr_trans.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztr_trans.c +lapacke_ctz_nancheck.c lapacke_dtz_nancheck.c lapacke_stz_nancheck.c lapacke_ztz_nancheck.c +lapacke_ctz_trans.c lapacke_dtz_trans.c lapacke_stz_trans.c lapacke_ztz_trans.c + +lapacke_make_complex_float.c lapacke_make_complex_double.c +lapacke_lsame.c +lapacke_xerbla.c ) diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index adc573650..a1f863107 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ 
b/lapack-netlib/LAPACKE/utils/Makefile @@ -76,6 +76,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ctp_trans.o \ lapacke_ctr_nancheck.o \ lapacke_ctr_trans.o \ + lapacke_ctz_nancheck.o \ + lapacke_ctz_trans.o \ lapacke_dgb_nancheck.o \ lapacke_dgb_trans.o \ lapacke_dge_nancheck.o \ @@ -110,6 +112,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_dtp_trans.o \ lapacke_dtr_nancheck.o \ lapacke_dtr_trans.o \ + lapacke_dtz_nancheck.o \ + lapacke_dtz_trans.o \ lapacke_lsame.o \ lapacke_sgb_nancheck.o \ lapacke_sgb_trans.o \ @@ -145,6 +149,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_stp_trans.o \ lapacke_str_nancheck.o \ lapacke_str_trans.o \ + lapacke_stz_nancheck.o \ + lapacke_stz_trans.o \ lapacke_xerbla.o \ lapacke_zgb_nancheck.o \ lapacke_zgb_trans.o \ @@ -184,6 +190,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ztp_trans.o \ lapacke_ztr_nancheck.o \ lapacke_ztr_trans.o \ + lapacke_ztz_nancheck.o \ + lapacke_ztz_trans.o \ lapacke_make_complex_float.o \ lapacke_make_complex_double.o diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c new file mode 100644 index 000000000..bea956781 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c @@ -0,0 +1,144 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_cge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c new file mode 100644 index 000000000..8910aee7d --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
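For illustration only, and not part of any patch in this series: a standalone sketch of the triangle/rectangle split that LAPACKE_ctz_nancheck performs above, for the assumed case direct='F', uplo='L', column-major, m=5, n=3, lda=5. The sizes are made up; only the offset arithmetic mirrors the helper's logic.

/* Worked instance of the offset computation in the tz nancheck helpers:
 * a 5x3 forward/lower trapezoid is checked as a 3x3 lower triangle at a[0]
 * plus a 2x3 rectangle that starts at a[3] in column-major storage. */
#include <stdio.h>

int main(void)
{
    const int m = 5, n = 3, lda = 5;
    const int colmaj = 1;                    /* LAPACK_COL_MAJOR */
    const int tri_n  = (m < n) ? m : n;      /* 3x3 triangular part */
    const int rect_m = (m > n) ? m - n : m;  /* 2 rows left over    */
    const int rect_n = (n > m) ? n - m : n;  /* 3 columns           */
    /* front && lower && m > n: the rectangle sits directly below the triangle */
    const int rect_offset = tri_n * (colmaj ? 1 : lda);
    printf("triangle %dx%d at a[0], rectangle %dx%d at a[%d]\n",
           tri_n, tri_n, rect_m, rect_n, rect_offset);
    return 0;
}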
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_cge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c new file mode 100644 index 000000000..cd2ae6731 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_dge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda ) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c new file mode 100644 index 000000000..80d94ead9 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
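For illustration only, and not part of any patch in this series: a minimal caller of the new LAPACKE_dtz_nancheck shown above, assuming the lapacke_utils.h header from this series is on the include path and the program is linked against the built LAPACKE library; the 4x3 matrix and the injected NaN are made up.

#include <math.h>
#include <stdio.h>
#include "lapacke_utils.h"

int main(void)
{
    double a[12];                      /* 4x3, column-major, lda = 4 */
    for (int i = 0; i < 12; ++i) a[i] = 1.0;
    a[11] = nan("");                   /* row 4, column 3: inside the forward/lower trapezoid */
    lapack_logical bad =
        LAPACKE_dtz_nancheck( LAPACK_COL_MAJOR, 'f', 'l', 'n', 4, 3, a, 4 );
    printf("NaN found: %d\n", (int) bad);   /* expected output: NaN found: 1 */
    return 0;
}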
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_dge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c new file mode 100644 index 000000000..7d7c30f96 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_sge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_str_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c new file mode 100644 index 000000000..793f3833d --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_sge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c new file mode 100644 index 000000000..481fa4c03 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c @@ -0,0 +1,144 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_zge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ztr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c new file mode 100644 index 000000000..881052331 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_zge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} From 90d7451df55231b6ec656f3baf7a4e288697e7dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 15:10:00 +0100 Subject: [PATCH 075/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- cmake/lapacke.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 340ea6d6c..c740eceb4 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -2481,6 +2481,8 @@ set(Utils_SRC lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c + lapacke_ctz_nancheck.c lapacke_ctz_trans.c lapacke_dtz_nancheck.c lapacke_dtz_trans.c + lapacke_stz_nancheck.c lapacke_stz_trans.c lapacke_ztz_nancheck.c lapacke_ztz_trans.c ) set(LAPACKE_REL_SRC "") From 645633e321e09fafe271844011810a0fabd4f154 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 17:48:02 +0100 Subject: [PATCH 076/154] Fix leading dimension check of eigen-/Schur vectors (Reference-LAPACK PR 665) --- lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_cgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c index 081f5b129..af6a247ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_cgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c index 2257c64df..632ddd661 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_cgeevx_work( int 
matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c index ff74939a3..be0b8347f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_cgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -15; LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c index 7edb1fa9b..311fe6e0a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_cggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c index c4de72394..424f5d176 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_dgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c index 9efb49ed3..7f4c6881d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_dgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c index effa1b3f5..bc6bf47d9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_dgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c index ace40a32a..bde1321d7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_dggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; From 2226a82f2e6e9d4891247941fe946b33415c5d0f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 17:50:49 +0100 Subject: [PATCH 077/154] Fix leading dimension check of eigen-/Schur vectors (Reference-LAPACK PR 665) --- lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_sgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c index 0f5a8e004..af6dbedf0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_sgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c index d05ea16e9..67f4982bf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_sgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c 
b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c index a3b09de30..1bd3eacf4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_sgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c index d3927e525..b1fbe1902 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_sggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c index 9393f825a..445b9dc1c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_zgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c index e34112c09..29dbf06f0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_zgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c index 2694c6530..13e2455c6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_zgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -15; LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_zgges_work", 
info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c index f9f1ccee1..fe99949b7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_zggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; From 0c2aa0bed7d51af06f2ef2ab24779722473bce9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:29:08 +0100 Subject: [PATCH 078/154] Fix implicit conversions and unused variables (Reference-LAPACK PR 703) --- lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c | 1 - 4 files changed, 4 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 8406635e9..05ff8d57f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c index 4e1b87681..4a0d427b3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec..627d2406c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47..1d318e571 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; From a485e4f5156ad08dad26cdc20960b379fbc6a919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:30:06 +0100 Subject: [PATCH 079/154] Fix implicit conversions and unused 
variables (Reference-LAPACK PR 703) --- lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c index 1c027f862..a174fcaf0 100644 --- a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c @@ -49,11 +49,9 @@ LAPACKE_dgels (row-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Calling DGELS using row-major layout */ @@ -66,8 +64,8 @@ int main (int argc, const char * argv[]) { /* Locals */ - double A[5][3] = {1,1,1,2,3,4,3,5,2,4,2,5,5,4,3}; - double b[5][2] = {-10,-3,12,14,14,12,16,16,18,16}; + double A[5][3] = {{1,1,1},{2,3,4},{3,5,2},{4,2,5},{5,4,3}}; + double b[5][2] = {{-10,-3},{12,14},{14,12},{16,16},{18,16}}; lapack_int info,m,n,lda,ldb,nrhs; /* Initialization */ From c99d27ae451a8c3e1ad46f3ff2fc2661ccb94896 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:33:20 +0100 Subject: [PATCH 080/154] Fix implicit conversions and unused variables (Reference-LAPACK PR 703) --- lapack-netlib/TESTING/EIG/cdrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/cget37.f | 2 +- lapack-netlib/TESTING/EIG/ddrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/sdrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/zdrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/zget37.f | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f index a93933a27..d15b39d01 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg.f @@ -663,8 +663,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/cget37.f b/lapack-netlib/TESTING/EIG/cget37.f index c2a6589f3..44d4580d6 100644 --- a/lapack-netlib/TESTING/EIG/cget37.f +++ b/lapack-netlib/TESTING/EIG/cget37.f @@ -265,7 +265,7 @@ 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = REAL( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f index 0b49c8404..2e9d3c643 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg.f @@ -645,8 +645,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f index 4a57223c8..877579bcd 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg.f @@ -645,8 +645,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f index 336514a3f..71f1d6371 
100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg.f @@ -663,8 +663,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/zget37.f b/lapack-netlib/TESTING/EIG/zget37.f index 63680e855..5013fbdd9 100644 --- a/lapack-netlib/TESTING/EIG/zget37.f +++ b/lapack-netlib/TESTING/EIG/zget37.f @@ -265,7 +265,7 @@ 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = DBLE( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) From fdb012ceed9ec69d900f9c5117e7be4263b6a947 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:37:18 +0100 Subject: [PATCH 081/154] Fix implicit conversions and unused variables (Reference-LAPACK PR 703) --- lapack-netlib/TESTING/LIN/cchkpt.f | 6 +-- lapack-netlib/TESTING/LIN/cchktr.f | 54 ++++++++++++++++++++++----- lapack-netlib/TESTING/LIN/cdrvgt.f | 8 ++-- lapack-netlib/TESTING/LIN/clattp.f | 6 +-- lapack-netlib/TESTING/LIN/cpbt01.f | 5 ++- lapack-netlib/TESTING/LIN/cpot01.f | 4 +- lapack-netlib/TESTING/LIN/cppt01.f | 2 +- lapack-netlib/TESTING/LIN/cpst01.f | 2 +- lapack-netlib/TESTING/LIN/zchkpt.f | 6 +-- lapack-netlib/TESTING/LIN/zchktr.f | 60 +++++++++++++++++++++++------- lapack-netlib/TESTING/LIN/zdrvgt.f | 8 ++-- lapack-netlib/TESTING/LIN/zdrvpt.f | 12 +++--- lapack-netlib/TESTING/LIN/zlattp.f | 6 +-- lapack-netlib/TESTING/LIN/zpbt01.f | 5 ++- lapack-netlib/TESTING/LIN/zpot01.f | 4 +- lapack-netlib/TESTING/LIN/zppt01.f | 2 +- lapack-netlib/TESTING/LIN/zpst01.f | 2 +- 17 files changed, 131 insertions(+), 61 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchkpt.f b/lapack-netlib/TESTING/LIN/cchkpt.f index 2ec802064..7dc367eeb 100644 --- a/lapack-netlib/TESTING/LIN/cchkpt.f +++ b/lapack-netlib/TESTING/LIN/cchkpt.f @@ -319,15 +319,15 @@ * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = REAL( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = REAL( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = REAL( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index ce1ecf761..c55b07643 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -210,9 +210,9 @@ * .. * .. External Subroutines .. 
EXTERNAL ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04, - $ CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS, - $ CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI, - $ CTRTRS, XLAENV + $ CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR, + $ CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03, + $ CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Complex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = REAL( A( 1 ) ) * CALL CTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B. +* + SRNAMT = 'CLATRS3' + CALL CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B, 1 ) + CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from CLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL CSSCAL( N, BIGNUM, X, 1 ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cdrvgt.f b/lapack-netlib/TESTING/LIN/cdrvgt.f index 8d43f640f..acfbbcfa1 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgt.f +++ b/lapack-netlib/TESTING/LIN/cdrvgt.f @@ -307,16 +307,16 @@ IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = REAL( A( N ) ) A( N ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = REAL( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = REAL( A( 3*N-2 ) ) + Z( 2 ) = REAL( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/clattp.f b/lapack-netlib/TESTING/LIN/clattp.f index 82f0585df..a47a252ad 100644 --- a/lapack-netlib/TESTING/LIN/clattp.f +++ b/lapack-netlib/TESTING/LIN/clattp.f @@ -336,7 +336,7 @@ WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = CLARND( 2, ISEED ) + REXP = REAL( CLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*CLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, N - J - T = AP( JR-I+J ) + T = REAL( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = REAL( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/cpbt01.f b/lapack-netlib/TESTING/LIN/cpbt01.f index 33c80666d..6145a1875 100644 --- a/lapack-netlib/TESTING/LIN/cpbt01.f +++ b/lapack-netlib/TESTING/LIN/cpbt01.f @@ -201,7 +201,8 @@ * * Compute the (K,K) element of the result. * - AKK = CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = REAL( + $ CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ * * Scale column K by the diagonal element. * - AKK = AFAC( 1, K ) + AKK = REAL( AFAC( 1, K ) ) CALL CSSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cpot01.f b/lapack-netlib/TESTING/LIN/cpot01.f index 00e195dd6..fbcf65086 100644 --- a/lapack-netlib/TESTING/LIN/cpot01.f +++ b/lapack-netlib/TESTING/LIN/cpot01.f @@ -176,7 +176,7 @@ * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. @@ -224,7 +224,7 @@ 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = CLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/cppt01.f b/lapack-netlib/TESTING/LIN/cppt01.f index 3a761a4c7..f865ec779 100644 --- a/lapack-netlib/TESTING/LIN/cppt01.f +++ b/lapack-netlib/TESTING/LIN/cppt01.f @@ -178,7 +178,7 @@ * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = REAL( CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/cpst01.f b/lapack-netlib/TESTING/LIN/cpst01.f index 26da4b394..03d25515d 100644 --- a/lapack-netlib/TESTING/LIN/cpst01.f +++ b/lapack-netlib/TESTING/LIN/cpst01.f @@ -219,7 +219,7 @@ * * Compute the (K,K) element of the result. 
* - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/zchkpt.f b/lapack-netlib/TESTING/LIN/zchkpt.f index 80e1690a7..11089d2a1 100644 --- a/lapack-netlib/TESTING/LIN/zchkpt.f +++ b/lapack-netlib/TESTING/LIN/zchkpt.f @@ -319,15 +319,15 @@ * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = DBLE( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = DBLE( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = DBLE( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 0a6f47b1e..275ca2857 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = DBLE( A( 1 ) ) * CALL ZTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. 
+* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/zdrvgt.f b/lapack-netlib/TESTING/LIN/zdrvgt.f index d055e4bdb..b2e0f66b1 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgt.f +++ b/lapack-netlib/TESTING/LIN/zdrvgt.f @@ -307,16 +307,16 @@ IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = DBLE( A( N ) ) A( N ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = DBLE( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = DBLE( A( 3*N-2 ) ) + Z( 2 ) = DBLE( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/zdrvpt.f b/lapack-netlib/TESTING/LIN/zdrvpt.f index 14a9f76ba..75f4d5738 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpt.f +++ b/lapack-netlib/TESTING/LIN/zdrvpt.f @@ -266,12 +266,12 @@ * IA = 1 DO 20 I = 1, N - 1 - D( I ) = A( IA ) + D( I ) = DBLE( A( IA ) ) E( I ) = A( IA+1 ) IA = IA + 2 20 CONTINUE IF( N.GT.0 ) - $ D( N ) = A( IA ) + $ D( N ) = DBLE( A( IA ) ) ELSE * * Type 7-12: generate a diagonally dominant matrix with @@ -333,13 +333,13 @@ Z( 2 ) = D( 1 ) D( 1 ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = E( 1 ) + Z( 3 ) = DBLE( E( 1 ) ) E( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N IF( N.GT.1 ) THEN - Z( 1 ) = E( N-1 ) + Z( 1 ) = DBLE( E( N-1 ) ) E( N-1 ) = ZERO END IF Z( 2 ) = D( N ) @@ -347,9 +347,9 @@ ELSE IF( IMAT.EQ.10 ) THEN IZERO = ( N+1 ) / 2 IF( IZERO.GT.1 ) THEN - Z( 1 ) = E( IZERO-1 ) + Z( 1 ) = DBLE( E( IZERO-1 ) ) E( IZERO-1 ) = ZERO - Z( 3 ) = E( IZERO ) + Z( 3 ) = DBLE( E( IZERO ) ) E( IZERO ) = ZERO END IF Z( 2 ) = D( IZERO ) diff --git a/lapack-netlib/TESTING/LIN/zlattp.f b/lapack-netlib/TESTING/LIN/zlattp.f index b728852b5..e05d9299e 100644 --- a/lapack-netlib/TESTING/LIN/zlattp.f +++ b/lapack-netlib/TESTING/LIN/zlattp.f @@ -336,7 +336,7 @@ WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = ZLARND( 2, ISEED ) + REXP = DBLE( ZLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*ZLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, 
N - J - T = AP( JR-I+J ) + T = DBLE( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = DBLE( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/zpbt01.f b/lapack-netlib/TESTING/LIN/zpbt01.f index fb7881ac7..1801b66cf 100644 --- a/lapack-netlib/TESTING/LIN/zpbt01.f +++ b/lapack-netlib/TESTING/LIN/zpbt01.f @@ -201,7 +201,8 @@ * * Compute the (K,K) element of the result. * - AKK = ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = DBLE( + $ ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ * * Scale column K by the diagonal element. * - AKK = AFAC( 1, K ) + AKK = DBLE( AFAC( 1, K ) ) CALL ZDSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/zpot01.f b/lapack-netlib/TESTING/LIN/zpot01.f index d71445cd4..de83414c6 100644 --- a/lapack-netlib/TESTING/LIN/zpot01.f +++ b/lapack-netlib/TESTING/LIN/zpot01.f @@ -176,7 +176,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. @@ -224,7 +224,7 @@ 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = ZLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/zppt01.f b/lapack-netlib/TESTING/LIN/zppt01.f index 78ec595af..acaea50d2 100644 --- a/lapack-netlib/TESTING/LIN/zppt01.f +++ b/lapack-netlib/TESTING/LIN/zppt01.f @@ -178,7 +178,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/zpst01.f b/lapack-netlib/TESTING/LIN/zpst01.f index 691857219..bed18c514 100644 --- a/lapack-netlib/TESTING/LIN/zpst01.f +++ b/lapack-netlib/TESTING/LIN/zpst01.f @@ -219,7 +219,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. From 8b3f9715ec122dd40a8a2638b92757ac7d8ff7f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 22:48:37 +0100 Subject: [PATCH 082/154] Add macros for 32/64bit integer printf --- lapack-netlib/LAPACKE/include/lapack.h | 100 +++++++++++++++++- .../LAPACKE/include/lapacke_config.h | 18 +++- 2 files changed, 112 insertions(+), 6 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 14695fdc8..b5a276f5a 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,6 +12,7 @@ #include #include +#include /* It seems all current Fortran compilers put strlen at end. 
* Some historical compilers put strlen after the str argument @@ -80,11 +81,26 @@ extern "C" { /*----------------------------------------------------------------------------*/ #ifndef lapack_int -#define lapack_int int +#if defined(LAPACK_ILP64) +#define lapack_int int64_t +#else +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 +#endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif /* f2c, hence clapack and MacOS Accelerate, returns double instead of float @@ -115,7 +131,7 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) ( const lapack_complex_double*, const lapack_complex_double* ); #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, +lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, lapack_int lca, lapack_int lcb #ifdef LAPACK_FORTRAN_STRLEN_END , size_t, size_t @@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) 
LAPACK_strsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 4a7d15760..c64fc4416 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -42,17 +42,29 @@ extern "C" { #include #include +#include #ifndef lapack_int #if defined(LAPACK_ILP64) -#define lapack_int int64_t +#define lapack_int int64_t #else -#define lapack_int int32_t +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 #endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif #ifndef LAPACK_COMPLEX_CUSTOM From 29dc086f38eb0220f301b2f6d9a3dc85e9346dbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 22:50:02 +0100 Subject: [PATCH 083/154] Add macros for 32/64bit integer printf --- lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c | 6 ++---- lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c | 6 ++---- lapack-netlib/LAPACKE/example/lapacke_example_aux.c | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c index c8bdd6e4e..44a470d47 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (col-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Includes */ #include @@ -94,7 +92,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c index 35bdcbcae..5411ef049 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (row-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - December 2016 - */ #include #include @@ -91,7 +89,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c index 9b72eb620..19fff7905 100644 --- a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c +++ b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c @@ -28,6 +28,6 @@ void print_matrix_colmajor( char* desc, lapack_int m, lapack_int n, double* mat, void print_vector( char* desc, lapack_int n, lapack_int* vec ) { lapack_int j; printf( "\n %s\n", desc ); - for( j = 0; j < n; j++ ) printf( " %6i", vec[j] ); + for( j = 0; j < n; j++ ) printf( " %6" LAPACK_IFMT, vec[j] ); printf( "\n" ); } From 4bc918a791d0b32a3e56b0b072d3d8cb72873a57 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:03:31 +0100 Subject: [PATCH 084/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/LAPACKE/include/lapack.h | 100 +++++++++++++++++++++++- lapack-netlib/LAPACKE/include/lapacke.h | 74 ++++++++++++++++++ 2 files changed, 171 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 14695fdc8..b5a276f5a 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,6 +12,7 @@ #include #include +#include /* It seems all current Fortran compilers put strlen at end. 
* Some historical compilers put strlen after the str argument @@ -80,11 +81,26 @@ extern "C" { /*----------------------------------------------------------------------------*/ #ifndef lapack_int -#define lapack_int int +#if defined(LAPACK_ILP64) +#define lapack_int int64_t +#else +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 +#endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif /* f2c, hence clapack and MacOS Accelerate, returns double instead of float @@ -115,7 +131,7 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) ( const lapack_complex_double*, const lapack_complex_double* ); #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, +lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, lapack_int lca, lapack_int lcb #ifdef LAPACK_FORTRAN_STRLEN_END , size_t, size_t @@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) 
LAPACK_strsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index f6fbfcc33..9998b1504 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -2313,6 +2313,19 @@ lapack_int LAPACKE_zlagge( int matrix_layout, lapack_int m, lapack_int n, float LAPACKE_slamch( char cmach ); double LAPACKE_dlamch( char cmach ); +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ); +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ); +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ); +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ); + float LAPACKE_slange( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda ); double LAPACKE_dlange( int matrix_layout, char norm, lapack_int m, @@ -4477,6 +4490,23 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ); +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ); +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ); + lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n, @@ -7576,6 +7606,21 @@ double LAPACKE_dlapy3_work( double x, double y, double z ); float LAPACKE_slamch_work( char cmach ); double LAPACKE_dlamch_work( char cmach ); +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ); +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int 
ldab, double* work ); +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ); +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ); + float LAPACKE_slange_work( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* work ); @@ -10174,6 +10219,35 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, + float* c, lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ); +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, + double* c, lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ); +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ); +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ); + lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag, From 7eb265326836d4480aa8a29cd93a3edc9b5c3b95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:07:10 +0100 Subject: [PATCH 085/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c | 56 ++++++++++++ .../LAPACKE/src/lapacke_ctrsyl3_work.c | 88 +++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c | 68 ++++++++++++++ .../LAPACKE/src/lapacke_dtrsyl3_work.c | 86 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_strsyl3.c | 68 ++++++++++++++ .../LAPACKE/src/lapacke_strsyl3_work.c | 86 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c | 56 ++++++++++++ .../LAPACKE/src/lapacke_ztrsyl3_work.c | 88 +++++++++++++++++++ 12 files changed, 596 insertions(+), 4 deletions(-) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_strsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c create mode 100644 
lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 8406635e9..05ff8d57f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c new file mode 100644 index 000000000..c931aac48 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c new file mode 100644 index 000000000..09c08d92a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int 
ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c index 4e1b87681..4a0d427b3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c new file mode 100644 index 000000000..c95a772de --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, m, a, lda ) ) { + 
return -7; + } + if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c new file mode 100644 index 000000000..272c35b38 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, double* c, + lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + double* a_t = NULL; + double* b_t = NULL; + double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (double*)LAPACKE_malloc( sizeof(double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, 
&ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec..627d2406c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c new file mode 100644 index 000000000..1cfc626c2 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c new file mode 100644 index 000000000..3c50e4a45 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c @@ -0,0 +1,86 @@ +#include 
"lapacke_utils.h" + +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, float* c, + lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + float* a_t = NULL; + float* b_t = NULL; + float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (float*)LAPACKE_malloc( sizeof(float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47..1d318e571 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c new file mode 100644 index 000000000..dbc9bcf9f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const 
lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c new file mode 100644 index 000000000..a7ebd5da6 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = 
LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + return info; +} From 6eb707d94110f800b6b10b78b648fc3e41672d01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:10:13 +0100 Subject: [PATCH 086/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/SRC/clatrs3.f | 666 +++++++++++++++++++ lapack-netlib/SRC/ctrsyl3.f | 1142 ++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dlarmm.f | 99 +++ lapack-netlib/SRC/dlatrs3.f | 656 ++++++++++++++++++ lapack-netlib/SRC/dtrsyl3.f | 1241 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ilaenv.f | 15 + lapack-netlib/SRC/slarmm.f | 99 +++ lapack-netlib/SRC/slatrs3.f | 656 ++++++++++++++++++ lapack-netlib/SRC/strsyl3.f | 1244 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zlatrs3.f | 667 +++++++++++++++++++ lapack-netlib/SRC/ztrsyl3.f | 1142 ++++++++++++++++++++++++++++++++ 11 files changed, 7627 insertions(+) create mode 100644 lapack-netlib/SRC/clatrs3.f create mode 100644 lapack-netlib/SRC/ctrsyl3.f create mode 100644 lapack-netlib/SRC/dlarmm.f create mode 100644 lapack-netlib/SRC/dlatrs3.f create mode 100644 lapack-netlib/SRC/dtrsyl3.f create mode 100644 lapack-netlib/SRC/slarmm.f create mode 100644 lapack-netlib/SRC/slatrs3.f create mode 100644 lapack-netlib/SRC/strsyl3.f create mode 100644 lapack-netlib/SRC/zlatrs3.f create mode 100644 lapack-netlib/SRC/ztrsyl3.f diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f new file mode 100644 index 000000000..a902f1ed0 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.f @@ -0,0 +1,666 @@ +*> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. 
If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. 
If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. 
+ CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( LDX, * ) + REAL CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CLATRS, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. 
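+* BIGNUM is the overflow threshold and SMLNUM the safe minimum;
+* SMLNUM serves as the smallest admissible local scale factor kept
+* in WORK, and BIGNUM bounds the rescaling of solution segments
+* further below.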
+* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
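+*     Each block column of X contains at most NBRHS right-hand sides.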
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
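+*                 A zero solution carries no scaling information, so
+*                 the factors are reset to ONE.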
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of CLATRS3 +* + END diff --git a/lapack-netlib/SRC/ctrsyl3.f b/lapack-netlib/SRC/ctrsyl3.f new file mode 100644 index 000000000..586dc0207 --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b CTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> CTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. 
A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complexSYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. 
Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + REAL SCALE +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) + REAL SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX CSGN +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL CLANGE, SLAMCH, SLARMM + EXTERNAL CLANGE, ILAENV, LSAME, SLAMCH, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CSSCAL, CGEMM, CLASCL, CTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'CTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. 
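+*     BUF collects the powers of two that are factored out of the local
+*     scale factors SWORK( K, L ); it is multiplied back into SCALE
+*     before returning.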
+* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) + CSGN = CMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
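+*                    EXPONENT( SCALOC ) is the binary exponent of
+*                    SCALOC, so the divisions below rescale by an
+*                    exact power of two without rounding error.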
+ BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
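+*        By at most BIGNUM / SCAL, where SCAL is the largest absolute
+*        real or imaginary part over all entries of C, computed below.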
+* + SCAL = MAX( ABS( REAL( C( 1, 1 ) ) ), + $ ABS( AIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( REAL ( C( K, L ) ) ), + $ ABS( AIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL CLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of CTRSYL3 +* + END diff --git a/lapack-netlib/SRC/dlarmm.f b/lapack-netlib/SRC/dlarmm.f new file mode 100644 index 000000000..c36042009 --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f new file mode 100644 index 000000000..b4a98bc78 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. 
+* +* Definition: +* =========== +* +* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. 
LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. 
The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE, DLARMM + EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLATRS, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks +* + NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I+KK*LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. 
+ $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
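+*     Each block column of X contains at most NBRHS right-hand sides.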
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. 
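+*                 A zero solution carries no scaling information, so
+*                 the factors are reset to ONE.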
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) +* + CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of DLATRS3 +* + END diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f new file mode 100644 index 000000000..c44ec3808 --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. 
+*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
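+*>          For example, if M = N = 1000 and the block size is
+*>          NB = 100, then ROWS = 11, so LDSWORK >= 11 and the
+*>          minimal LIWORK is 22.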
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLANGE, DLAMCH, DLARMM + EXTERNAL DLANGE, DLAMCH, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. 
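+*
+*     As a concrete illustration of the SWORK layout prepared above:
+*     with, say, NBA = 3 and NBB = 2, columns 1:2 of SWORK hold the
+*     block scale factors SWORK( K, L ), columns AWRK+1:AWRK+NBA = 3:5
+*     hold the upper bounds of the blocks of A, and columns
+*     BWRK+1:BWRK+NBB = 6:7 hold the upper bounds of the blocks of B,
+*     i.e. 2*NBB + NBA = 7 columns in total, matching the optimal
+*     COLS returned by a workspace query.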
+* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index af2850398..a639e0375 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -469,6 +469,15 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ), + $ 80 ) + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN @@ -477,6 +486,12 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'TRS' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN IF( C3.EQ.'EBZ' ) THEN diff --git a/lapack-netlib/SRC/slarmm.f b/lapack-netlib/SRC/slarmm.f new file mode 100644 index 000000000..643dd6748 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. 
+* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. + REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f new file mode 100644 index 000000000..c3a08e524 --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. 
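+*>
+*> For example, if scale(k) = 2**(-100) is returned, the solution of
+*> the unscaled system A * x = b(:,k) is 2**100 * X(:,k); the scaled
+*> form is returned because forming that product may overflow.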
+*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. 
+*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL SLATRS, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. 
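+*     For example, with N = 100 and NB = 32 the NBA = 4 diagonal blocks
+*     start at rows/columns 1, 33, 65 and 97 and, in the no-transpose
+*     case, the bound of the off-diagonal block A( I1:I2-1, J1:J2-1 )
+*     is stored in WORK( AWRK + I + (J-1)*NBA ).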
+* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. 
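+*                 The zero entry SCALE( RHS ) = 0 is what signals the
+*                 singularity of A to the caller, in line with the
+*                 description of the SCALE argument above.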
+ SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). 
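+*                 For example, if WORK( I+KK*LDS ) = 1/2 and
+*                 WORK( J+KK*LDS ) = 1/8, then SCAMIN = 1/8; apart from
+*                 the robust factor SCALOC, X( I1:I2-1, RHS ) is
+*                 multiplied by 1/4 and X( J1:J2-1, RHS ) is left
+*                 unchanged, so both segments carry the common scale
+*                 factor 1/8 before the GEMM update below.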
+* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of SLATRS3 +* + END diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f new file mode 100644 index 000000000..28762c2ed --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.f @@ -0,0 +1,1244 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. 
+*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. 
+ INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. 
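+* For illustration, assume NB = 8 and that B has a 2-by-2 diagonal
+* block in rows 8:9: the initial boundary IWORK( PC + 2 ) = 9 would
+* split that block, so it is moved to 10 and the first block of B
+* then covers rows 1:9.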
+ DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. 
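+* For illustration, assume BUF = 2**(-40) and a largest entry of
+* magnitude SCAL = 1.0E+30: in single precision
+* MIN( BIGNUM / SCAL, ONE / BUF ) is about 8.5E+7, so C is scaled up
+* by that factor and BUF grows to about 7.7E-5, i.e. much closer to 1.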
+* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f new file mode 100644 index 000000000..fc1be0517 --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.f @@ -0,0 +1,667 @@ +*> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX*16 A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> The triangular matrix A. 
If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. 
To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( LDX, * ) + DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE, DLARMM + EXTERNAL ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM +* .. +* .. External Subroutines .. + EXTERNAL ZLATRS, ZDSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. 
Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1) * NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. 
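+* As a minimal sketch with assumed sizes N = 200, NRHS = 50, NB = 64
+* and NBRHS = 32: NBA = 4 and NBX = 2, so X is processed in two block
+* columns of 32 and 18 right-hand sides, and the first part of WORK
+* reserves NBA*NBRHS = 128 local scale factors for them.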
+ DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO + +* +* Reduce local scaling factors +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of ZLATRS3 +* + END diff --git a/lapack-netlib/SRC/ztrsyl3.f b/lapack-netlib/SRC/ztrsyl3.f new file mode 100644 index 000000000..b5a058da4 --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b ZTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> ZTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complex16SYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) + DOUBLE PRECISION SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX*16 CSGN +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLARMM, ZLANGE + EXTERNAL DLAMCH, DLARMM, ILAENV, LSAME, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMM, ZLASCL, ZTRSYL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DIMAG, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'ZTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) + CSGN = DCMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
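+*                    The factor 2**EXPONENT( SCALOC ) absorbed into
+*                    BUF above (when SCALOC is nonzero) is divided
+*                    out of every entry of SWORK here so that the
+*                    per-block scale factors stay representable;
+*                    BUF rejoins the reported scale factor at the
+*                    end via SCALE = SCALE * BUF.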
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
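+*
+*        As a hypothetical illustration: if BUF = 0.25 at this point
+*        and the largest entry magnitude computed below is
+*        SCAL = BIGNUM / 8, then
+*        SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) = MIN( 8, 4 ) = 4,
+*        BUF is restored to ONE, C is multiplied by 4 by ZLASCL, and
+*        the final SCALE = SCALE * BUF is left untouched.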
+* + SCAL = MAX( ABS( DBLE( C( 1, 1 ) ) ), + $ ABS( DIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( DBLE ( C( K, L ) ) ), + $ ABS( DIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL ZLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of ZTRSYL3 +* + END From 92174725d90916a1942030d4afa4ef2f6a9e8a0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:16:12 +0100 Subject: [PATCH 087/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 11 +- lapack-netlib/TESTING/EIG/Makefile | 24 +- lapack-netlib/TESTING/EIG/cchkec.f | 42 +++- lapack-netlib/TESTING/EIG/cerrec.f | 43 +++- lapack-netlib/TESTING/EIG/csyl01.f | 294 +++++++++++++++++++++++ lapack-netlib/TESTING/EIG/dchkec.f | 46 +++- lapack-netlib/TESTING/EIG/derrec.f | 41 +++- lapack-netlib/TESTING/EIG/dsyl01.f | 288 ++++++++++++++++++++++ lapack-netlib/TESTING/EIG/schkec.f | 46 +++- lapack-netlib/TESTING/EIG/serrec.f | 41 +++- lapack-netlib/TESTING/EIG/ssyl01.f | 288 ++++++++++++++++++++++ lapack-netlib/TESTING/EIG/zchkec.f | 42 +++- lapack-netlib/TESTING/EIG/zerrec.f | 41 +++- lapack-netlib/TESTING/EIG/zsyl01.f | 294 +++++++++++++++++++++++ 14 files changed, 1468 insertions(+), 73 deletions(-) create mode 100644 lapack-netlib/TESTING/EIG/csyl01.f create mode 100644 lapack-netlib/TESTING/EIG/dsyl01.f create mode 100644 lapack-netlib/TESTING/EIG/ssyl01.f create mode 100644 lapack-netlib/TESTING/EIG/zsyl01.f diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 226004a90..3c8d9a8b2 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -40,7 +40,7 @@ set(SEIGTST schkee.F sget54.f sglmts.f sgqrts.f sgrqts.f sgsvts3.f shst01.f slarfy.f slarhs.f slatm4.f slctes.f slctsx.f slsets.f sort01.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f - sstt22.f ssyt21.f ssyt22.f) + sstt22.f ssyl01.f ssyt21.f ssyt22.f) set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f @@ -56,7 +56,7 @@ set(CEIGTST cchkee.F cget54.f cglmts.f cgqrts.f cgrqts.f cgsvts3.f chbt21.f chet21.f chet22.f chpt21.f chst01.f clarfy.f clarhs.f clatm4.f clctes.f clctsx.f clsets.f csbmv.f - csgt01.f cslect.f + csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f @@ -77,7 +77,7 @@ set(DEIGTST dchkee.F dget54.f dglmts.f dgqrts.f dgrqts.f dgsvts3.f dhst01.f dlarfy.f dlarhs.f dlatm4.f dlctes.f dlctsx.f dlsets.f dort01.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f - dstt22.f dsyt21.f dsyt22.f) + dstt22.f dsyl01.f dsyt21.f dsyt22.f) set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f @@ -93,13 +93,12 @@ set(ZEIGTST zchkee.F zget54.f zglmts.f zgqrts.f zgrqts.f zgsvts3.f zhbt21.f zhet21.f zhet22.f zhpt21.f zhst01.f zlarfy.f zlarhs.f zlatm4.f zlctes.f zlctsx.f zlsets.f zsbmv.f - zsgt01.f zslect.f + zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) -#${TMGLIB} 
../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index bccfccf95..e40358663 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -62,7 +62,7 @@ SEIGTST = schkee.o \ sget54.o sglmts.o sgqrts.o sgrqts.o sgsvts3.o \ shst01.o slarfy.o slarhs.o slatm4.o slctes.o slctsx.o slsets.o sort01.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ - sstt22.o ssyt21.o ssyt22.o + sstt22.o ssyl01.o ssyt21.o ssyt22.o CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ @@ -78,7 +78,7 @@ CEIGTST = cchkee.o \ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts3.o \ chbt21.o chet21.o chet22.o chpt21.o chst01.o \ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ - csgt01.o cslect.o \ + csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ @@ -99,7 +99,7 @@ DEIGTST = dchkee.o \ dget54.o dglmts.o dgqrts.o dgrqts.o dgsvts3.o \ dhst01.o dlarfy.o dlarhs.o dlatm4.o dlctes.o dlctsx.o dlsets.o dort01.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ - dstt22.o dsyt21.o dsyt22.o + dstt22.o dsyl01.o dsyt21.o dsyt22.o ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ @@ -115,7 +115,7 @@ ZEIGTST = zchkee.o \ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts3.o \ zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ - zsgt01.o zslect.o \ + zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o .PHONY: all @@ -127,17 +127,17 @@ complex: xeigtstc double: xeigtstd complex16: xeigtstz -xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) diff --git a/lapack-netlib/TESTING/EIG/cchkec.f b/lapack-netlib/TESTING/EIG/cchkec.f index 6727a0954..c892b0a54 100644 --- a/lapack-netlib/TESTING/EIG/cchkec.f +++ b/lapack-netlib/TESTING/EIG/cchkec.f @@ -23,7 +23,7 @@ *> \verbatim *> *> CCHKEC tests eigen- condition estimation routines -*> CTRSYL, CTREXC, CTRSNA, CTRSEN +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA, CTRSEN *> *> In all cases, the routine runs through a fixed set of numerical *> examples, subjects them to various tests, and compares the test @@ -88,17 +88,17 @@ * .. Local Scalars .. 
LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - REAL EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + REAL EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38 + EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38, CSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -120,10 +120,24 @@ $ CALL CERREC( PATH, NOUT ) * OK = .TRUE. - CALL CGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL CGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL CSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL CGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', E16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9972 FORMAT( 'CTRSYL and CTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in CTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in CTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) RETURN * * End of CCHKEC diff --git a/lapack-netlib/TESTING/EIG/cerrec.f b/lapack-netlib/TESTING/EIG/cerrec.f index 650ab2b6e..6e2e1d38a 100644 --- a/lapack-netlib/TESTING/EIG/cerrec.f +++ b/lapack-netlib/TESTING/EIG/cerrec.f @@ -23,7 +23,7 @@ *> *> CERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> CTRSYL, CTREXC, CTRSNA and CTRSEN. +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA and CTRSEN. *> \endverbatim * * Arguments: @@ -77,12 +77,12 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - REAL RW( LW ), S( NMAX ), SEP( NMAX ) + REAL RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL + EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL, CTRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'CTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test CTRSYL3 +* + SRNAMT = 'CTRSYL3' + INFOT = 1 + CALL CTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test CTREXC * SRNAMT = 'CTREXC' diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f new file mode 100644 index 000000000..e21f1a7a0 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -0,0 +1,294 @@ +*> \brief \b CSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CSYL01 tests CTRSYL and CTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements CGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual CTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual CTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times CTRSYL3 and CTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of CTRSYL +*> RMAX(2) = Value of the largest test ratio of CTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. 
of times CTRSYL where INFO is nonzero +*> NINFO(2) = No. of times CTRSYL3 where INFO is nonzero +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX RMUL +* .. +* .. Local Arrays .. + COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, CLANGE + EXTERNAL SISNAN, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CLATMR, CLACPY, CGEMM, CTRSYL, CTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.5E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 23 + KLA = 0 + KUA = M - 1 + CALL CLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = CLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 29 + KLB = 0 + KUB = N - 1 + CALL CLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = CLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL CLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ 
SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL CTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of CSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/dchkec.f b/lapack-netlib/TESTING/EIG/dchkec.f index 854961884..c4451a627 100644 --- a/lapack-netlib/TESTING/EIG/dchkec.f +++ b/lapack-netlib/TESTING/EIG/dchkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC DOUBLE PRECISION EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL DERREC, DGET31, DGET32, DGET33, DGET34, DGET35, - $ DGET36, DGET37, DGET38, DGET39, DGET40 + $ DGET36, DGET37, DGET38, DGET39, DGET40, DSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL DGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL DGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. 
- WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL DSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL DGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in DTGEXC: RMAX =', D12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'DTRSYL and DTRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in DTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in DTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of DCHKEC * diff --git a/lapack-netlib/TESTING/EIG/derrec.f b/lapack-netlib/TESTING/EIG/derrec.f index d5863ad42..f11f48887 100644 --- a/lapack-netlib/TESTING/EIG/derrec.f +++ b/lapack-netlib/TESTING/EIG/derrec.f @@ -23,7 +23,7 @@ *> *> DERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> DTRSYL, DTREXC, DTRSNA and DTRSEN. +*> DTRSYL, DTRSYL3, DTREXC, DTRSNA and DTRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL + EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL, DTRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'DTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test DTRSYL3 +* + SRNAMT = 'DTRSYL3' + INFOT = 1 + CALL DTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test DTREXC * SRNAMT = 'DTREXC' diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f new file mode 100644 index 000000000..782d2cd42 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -0,0 +1,288 @@ +*> \brief \b DSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYL01 tests DTRSYL and DTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements DGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual DTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual DTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times DTRSYL3 and DTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION, dimension (2) +*> RMAX(1) = Value of the largest test ratio of DTRSYL +*> RMAX(2) = Value of the largest test ratio of DTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times DTRSYL returns an expected INFO +*> NINFO(2) = No. of times DTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 245, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 126 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DLATMR, DLACPY, DGEMM, DTRSYL, DTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX +* .. +* .. Executable Statements .. 
+* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.000001D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL DLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = DLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL DLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = DLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL DLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL DTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of DSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/schkec.f b/lapack-netlib/TESTING/EIG/schkec.f index e6123e1ad..59abb2466 100644 --- a/lapack-netlib/TESTING/EIG/schkec.f +++ b/lapack-netlib/TESTING/EIG/schkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC REAL EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL SERREC, SGET31, SGET32, SGET33, SGET34, SGET35, - $ SGET36, SGET37, SGET38, SGET39, SGET40 + $ SGET36, SGET37, SGET38, SGET39, SGET40, SSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL SGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL SGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL SSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL SGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in STGEXC: RMAX =', E12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'STRSYL and STRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in STRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in STRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of SCHKEC * diff --git a/lapack-netlib/TESTING/EIG/serrec.f b/lapack-netlib/TESTING/EIG/serrec.f index 249f0e642..9a7ceb362 100644 --- a/lapack-netlib/TESTING/EIG/serrec.f +++ b/lapack-netlib/TESTING/EIG/serrec.f @@ -23,7 +23,7 @@ *> *> SERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> STRSYL, STREXC, STRSNA and STRSEN. +*> STRSYL, STRSYL3, STREXC, STRSNA and STRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. 
- EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL + EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL, STRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'STRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test STRSYL3 +* + SRNAMT = 'STRSYL3' + INFOT = 1 + CALL STRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test STREXC * SRNAMT = 'STREXC' diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f new file mode 100644 index 000000000..22d089dc8 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -0,0 +1,288 @@ +*> \brief \b SSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYL01 tests STRSYL and STRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements SGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual STRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual STRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times STRSYL3 and STRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is REAL, dimension (2) +*> RMAX(1) = Value of the largest test ratio of STRSYL +*> RMAX(2) = Value of the largest test ratio of STRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times STRSYL returns an expected INFO +*> NINFO(2) = No. of times STRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + REAL A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 54 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, SLANGE + EXTERNAL SISNAN, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SLATMR, SLACPY, SGEMM, STRSYL, STRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. 
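+*
+*     Overview of the sweep below: for each diagonal scaling VM( J ),
+*     sign ISGN and matrix sizes M and N, upper triangular matrices
+*     A (M-by-M) and B (N-by-N) and a right-hand side C (M-by-N) are
+*     generated with SLATMR.  The diagonal of A is multiplied by
+*     VM( J ), where the smaller value VM( 2 ) is intended to exercise
+*     the rescaling path (SCALE.LT.ONE).  The equation
+*
+*        op(A)*X + ISGN*X*op(B) = SCALE*C
+*
+*     is solved with STRSYL and with STRSYL3, the residual described
+*     in the header is formed with two calls to SGEMM, and failures,
+*     largest test ratios and nonzero INFO values are recorded in
+*     NFAIL, RMAX and NINFO.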
+* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.05E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL SLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = SLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL SLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = SLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL SLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL STRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ C, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, C, MAXM ) + RES1 = SLANGE( 'M', M, N, C, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL STRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = SLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of SSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/zchkec.f b/lapack-netlib/TESTING/EIG/zchkec.f index 1e1c29e0d..62a76d357 100644 --- a/lapack-netlib/TESTING/EIG/zchkec.f +++ b/lapack-netlib/TESTING/EIG/zchkec.f @@ -88,17 +88,17 @@ * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - DOUBLE PRECISION EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + DOUBLE PRECISION EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38 + EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38, ZSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -120,10 +120,24 @@ $ CALL ZERREC( PATH, NOUT ) * OK = .TRUE. - CALL ZGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL ZGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL ZSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL ZGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -148,7 +162,7 @@ WRITE( NOUT, FMT = 9996 )RTRSEN, LTRSEN, NTRSEN, KTRSEN END IF * - NTESTS = KTRSYL + KTREXC + KTRSNA + KTRSEN + NTESTS = KTRSYL + KTRSYL3 + KTREXC + KTRSNA + KTRSEN IF( OK ) $ WRITE( NOUT, FMT = 9995 )PATH, NTESTS * @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', D16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9970 FORMAT( 'Error in ZTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9971 FORMAT( 'Error in ZTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9972 FORMAT( 'ZTRSYL and ZTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') RETURN * * End of ZCHKEC diff --git a/lapack-netlib/TESTING/EIG/zerrec.f b/lapack-netlib/TESTING/EIG/zerrec.f index dc6129da9..e1938f57d 100644 --- a/lapack-netlib/TESTING/EIG/zerrec.f +++ b/lapack-netlib/TESTING/EIG/zerrec.f @@ -23,7 +23,7 @@ *> *> ZERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> ZTRSYL, ZTREXC, ZTRSNA and ZTRSEN. +*> ZTRSYL, ZTRSYL3, ZTREXC, ZTRSNA and ZTRSEN. *> \endverbatim * * Arguments: @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ) + DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX*16 A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. 
@@ -141,6 +141,43 @@ CALL CHKXER( 'ZTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test ZTRSYL3 +* + SRNAMT = 'ZTRSYL3' + INFOT = 1 + CALL ZTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test ZTREXC * SRNAMT = 'ZTREXC' diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f new file mode 100644 index 000000000..1e8619a34 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -0,0 +1,294 @@ +*> \brief \b ZSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZSYL01 tests ZTRSYL and ZTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements ZGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual ZTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual ZTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times ZTRSYL3 and ZTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of ZTRSYL +*> RMAX(2) = Value of the largest test ratio of ZTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times ZTRSYL returns an expected INFO +*> NINFO(2) = No. of times ZTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D+0 ) ) + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 185, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX*16 RMUL +* .. +* .. Local Arrays .. + COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL DISNAN, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZLATMR, ZLACPY, ZGEMM, ZTRSYL, ZTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, SQRT +* .. +* .. Executable Statements .. 
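+*
+*     Overview of the sweep below: for each diagonal scaling VM( J ),
+*     sign ISGN and matrix sizes M and N, upper triangular matrices
+*     A (M-by-M) and B (N-by-N) and a right-hand side C (M-by-N) are
+*     generated with ZLATMR.  The diagonals of A and B are multiplied
+*     by VM( J ), where the smaller value VM( 2 ) is intended to
+*     trigger rescaling (INFO = 1).  The equation
+*
+*        op(A)*X + ISGN*X*op(B) = SCALE*C
+*
+*     is solved with ZTRSYL and with ZTRSYL3, the residual described
+*     in the header is formed with two calls to ZGEMM, and failures,
+*     largest test ratios and nonzero INFO values are recorded in
+*     NFAIL, RMAX and NINFO.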
+* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.05D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 51 + KLA = 0 + KUA = M - 1 + CALL ZLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = ZLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL ZLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = ZLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL ZLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL ZTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of ZSYL01 +* + END From 13f3bbece1786da4236e128c29bfeeedfed20869 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:18:09 +0100 Subject: [PATCH 088/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/TESTING/LIN/cchktr.f | 54 ++++++++++++++++++++++----- lapack-netlib/TESTING/LIN/cerrtr.f | 47 +++++++++++++++++++++-- lapack-netlib/TESTING/LIN/dchktr.f | 56 ++++++++++++++++++++++------ lapack-netlib/TESTING/LIN/derrtr.f | 47 +++++++++++++++++++++-- lapack-netlib/TESTING/LIN/schktr.f | 57 ++++++++++++++++++++++------ lapack-netlib/TESTING/LIN/serrtr.f | 47 +++++++++++++++++++++-- lapack-netlib/TESTING/LIN/zchktr.f | 60 +++++++++++++++++++++++------- lapack-netlib/TESTING/LIN/zerrtr.f | 47 +++++++++++++++++++++-- 8 files changed, 358 insertions(+), 57 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index ce1ecf761..c55b07643 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -210,9 +210,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04, - $ CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS, - $ CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI, - $ CTRTRS, XLAENV + $ CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR, + $ CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03, + $ CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Complex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = REAL( A( 1 ) ) * CALL CTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B. +* + SRNAMT = 'CLATRS3' + CALL CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B, 1 ) + CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from CLATRS3. 
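+*                 (CLATRS3 solves both right-hand sides at once and
+*                 returns one scale factor per column in SCALE3; the
+*                 second column of B was scaled by BIGNUM above so
+*                 that rescaling is typically required for it.)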
+* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL CSSCAL( N, BIGNUM, X, 1 ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index db65edd88..9ba784f62 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -82,9 +82,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, CTBCON, - $ CTBRFS, CTBTRS, CTPCON, CTPRFS, CTPTRI, CTPTRS, - $ CTRCON, CTRRFS, CTRTI2, CTRTRI, CTRTRS + EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, + $ CLATRS3, CTBCON, CTBRFS, CTBTRS, CTPCON, + $ CTPRFS, CTPTRI, CTPTRS, CTRCON, CTRRFS, CTRTI2, + $ CTRTRI, CTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL CLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'CLATRS', INFOT, NOUT, LERR, OK ) * +* CLATRS3 +* + SRNAMT = 'CLATRS3' + INFOT = 1 + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. 
* ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/dchktr.f b/lapack-netlib/TESTING/LIN/dchktr.f index a4a1150c0..57e87326b 100644 --- a/lapack-netlib/TESTING/LIN/dchktr.f +++ b/lapack-netlib/TESTING/LIN/dchktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS +*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DLAMCH, DUMMY, RCOND, + $ RCONDC, RCONDI, RCONDO, RES, SCALE * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DCOPY, DERRTR, DGET04, - $ DLACPY, DLARHS, DLATRS, DLATTR, DTRCON, DTRRFS, - $ DTRT01, DTRT02, DTRT03, DTRT05, DTRT06, DTRTRI, - $ DTRTRS, XLAENV + $ DLACPY, DLAMCH, DSCAL, DLARHS, DLATRS, DLATRS3, + $ DLATTR, DTRCON, DTRRFS, DTRT01, DTRT02, DTRT03, + $ DTRT05, DTRT06, DTRTRI, DTRTRS, XLAENV * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Double precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'DLATRS3' + CALL DCOPY( N, X, 1, B, 1 ) + CALL DCOPY( N, X, 1, B( N+1 ), 1 ) + CALL DSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL DLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from DLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL DSCAL( N, BIGNUM, X, 1 ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +583,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'DLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +603,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index a667f0d2b..d0580497d 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, DTBCON, - $ DTBRFS, DTBTRS, DTPCON, DTPRFS, DTPTRI, DTPTRS, - $ DTRCON, DTRRFS, DTRTI2, DTRTRI, DTRTRS + EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, + $ DLATRS3, DTBCON, DTBRFS, DTBTRS, DTPCON, + $ DTPRFS, DTPTRI, DTPTRS, DTRCON, DTRRFS, + $ DTRTI2, DTRTRI, DTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL DLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'DLATRS', INFOT, NOUT, LERR, OK ) +* +* DLATRS3 +* + SRNAMT = 'DLATRS3' + INFOT = 1 + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 66fa0bee7..5aeb1ce88 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS +*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL 
ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, SCOPY, SERRTR, SGET04, - $ SLACPY, SLARHS, SLATRS, SLATTR, STRCON, STRRFS, - $ STRT01, STRT02, STRT03, STRT05, STRT06, STRTRI, - $ STRTRS, XLAENV + $ SLACPY, SLARHS, SLATRS, SLATRS3, SLATTR, SSCAL, + $ STRCON, STRRFS, STRT01, STRT02, STRT03, STRT05, + $ STRT06, STRTRI, STRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Single precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,33 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'SLATRS3' + CALL SCOPY( N, X, 1, B, 1 ) + CALL SCOPY( N, X, 1, B( N+1 ), 1 ) + CALL SSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL SLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from SLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) +* + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3 ( 1 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL SSCAL( N, BIGNUM, X, 1 ) + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +584,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'SLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +604,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index f0d0a0ef2..af1ce0a8e 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. 
- EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, STBCON, - $ STBRFS, STBTRS, STPCON, STPRFS, STPTRI, STPTRS, - $ STRCON, STRRFS, STRTI2, STRTRI, STRTRS + EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, + $ SLATRS3, STBCON, STBRFS, STBTRS, STPCON, + $ STPRFS, STPTRI, STPTRS, STRCON, STRRFS, STRTI2, + $ STRTRI, STRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL SLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'SLATRS', INFOT, NOUT, LERR, OK ) +* +* SLATRS3 +* + SRNAMT = 'SLATRS3' + INFOT = 1 + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 0a6f47b1e..275ca2857 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. 
* .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = DBLE( A( 1 ) ) * CALL ZTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 098040ace..211b92154 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -82,9 +82,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, ZTBCON, - $ ZTBRFS, ZTBTRS, ZTPCON, ZTPRFS, ZTPTRI, ZTPTRS, - $ ZTRCON, ZTRRFS, ZTRTI2, ZTRTRI, ZTRTRS + EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, + $ ZLATRS3, ZTBCON, ZTBRFS, ZTBTRS, ZTPCON, + $ ZTPRFS, ZTPTRI, ZTPTRS, ZTRCON, ZTRRFS, ZTRTI2, + $ ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL ZLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'ZLATRS', INFOT, NOUT, LERR, OK ) * +* ZLATRS3 +* + SRNAMT = 'ZLATRS3' + INFOT = 1 + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN From fb42a0cf8b4373de42aab691ea4b939185c8bfa3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 14:06:50 +0100 Subject: [PATCH 089/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- cmake/lapack.cmake | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 3b221d420..4c8efa11f 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -123,7 +123,8 @@ set(SLASRC ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f + slarmm.f slatrs3.f strsyl3.f) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -221,7 +222,8 @@ set(CLASRC cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cungtsqr_row.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f + clatrs3.f ctrsyl3.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -313,7 +315,8 @@ set(DLASRC dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f + dlarmm.f dlatrs3.f dtrsyl3.f) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -415,7 +418,8 @@ set(ZLASRC zheevd_2stage.f zheev_2stage.f zheevx_2stage.f 
zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zungtsqr_row.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f + zlarts3.f ztrsyl3.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f From bb652f65a37a6d0bb973074136e5742b61d23cdb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 16:35:13 +0100 Subject: [PATCH 090/154] Typo fix --- cmake/lapack.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 4c8efa11f..a78a89f1a 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -419,7 +419,7 @@ set(ZLASRC zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zungtsqr.f zungtsqr_row.f zunhr_col.f - zlarts3.f ztrsyl3.f) + zlatrs3.f ztrsyl3.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f From 52c2a0397be870f7158009a30a029222faa12f56 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 17:13:08 +0100 Subject: [PATCH 091/154] Restore OpenBLAS modifications to link line --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 3c8d9a8b2..d252c7fa9 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,8 @@ set(ZEIGTST zchkee.F macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) +#${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) From 2592853fc72ec3358ca0f30f72326d831df515e9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 21:47:37 +0100 Subject: [PATCH 092/154] Restore OpenBLAS-specific changes --- lapack-netlib/TESTING/EIG/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index e40358663..942ae6982 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -127,17 +127,17 @@ complex: xeigtstc double: xeigtstd complex16: xeigtstz -xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) From 
95da5141f0cb2c7a8aaf553f84287aef8eccf21f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 22:21:29 +0100 Subject: [PATCH 093/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/SRC/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 03d15c23c..a5d5acdf2 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o + sgesvdq.o slarmm.o slarts3.o strsyl3.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o + cgesvdq.o clarts3.o ctrsyl3.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o + dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o + zgesvdq.o zlatrs3.o ztrsyl3.o endif ifdef USEXBLAS From 379efbe5af02059375c1eb1d312834789e17d13e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Nov 2022 11:03:12 +0100 Subject: [PATCH 094/154] Fix typos --- lapack-netlib/SRC/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index a5d5acdf2..49eb69cfe 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o slarmm.o slarts3.o strsyl3.o + sgesvdq.o slarmm.o slatrs3.o strsyl3.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o clarts3.o ctrsyl3.o + cgesvdq.o clatrs3.o ctrsyl3.o endif ifdef USEXBLAS From b2cc310470a91165cda1e4da17426c4babf1845a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Nov 2022 14:23:46 +0100 Subject: [PATCH 095/154] Add f2c-converted versions of the new BLAS3-based Sylvester solver --- lapack-netlib/SRC/clatrs3.c | 1155 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ctrsyl3.c | 381 ++++++++++++ lapack-netlib/SRC/dlarmm.c | 478 +++++++++++++++ lapack-netlib/SRC/dlatrs3.c | 1138 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dtrsyl3.c | 381 ++++++++++++ lapack-netlib/SRC/slarmm.c | 478 +++++++++++++++ lapack-netlib/SRC/slatrs3.c | 1135 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/strsyl3.c | 381 ++++++++++++ lapack-netlib/SRC/zlatrs3.c | 1157 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ztrsyl3.c | 381 ++++++++++++ 10 files changed, 7065 insertions(+) create mode 
100644 lapack-netlib/SRC/clatrs3.c create mode 100644 lapack-netlib/SRC/ctrsyl3.c create mode 100644 lapack-netlib/SRC/dlarmm.c create mode 100644 lapack-netlib/SRC/dlatrs3.c create mode 100644 lapack-netlib/SRC/dtrsyl3.c create mode 100644 lapack-netlib/SRC/slarmm.c create mode 100644 lapack-netlib/SRC/slatrs3.c create mode 100644 lapack-netlib/SRC/strsyl3.c create mode 100644 lapack-netlib/SRC/zlatrs3.c create mode 100644 lapack-netlib/SRC/ztrsyl3.c diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c new file mode 100644 index 000000000..6124a7f19 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.c @@ -0,0 +1,1155 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." + + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef 
struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
*/ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int clatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, complex *a, integer *lda, complex * + x, integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + complex q__1; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + real rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer lscale; + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + integer *, complex *, integer *, complex *, real *, real *, + integer *); + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! 
+ lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + clatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = clange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + csscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = clange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). 
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + csscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + csscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__6, &i__7, &i__8, &q__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("T", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of CLATRS3 */ + +} /* clatrs3_ */ + diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char 
*ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLARMM */ + +/* Definition: */ +/* =========== */ + +/* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) */ + +/* DOUBLE PRECISION ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is DOUBLE PRECISION */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is DOUBLE PRECISION */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +doublereal dlarmm_(doublereal *anorm, doublereal *bnorm, doublereal *cnorm) +{ + /* System generated locals */ + doublereal ret_val; + + /* Local variables */ + extern doublereal dlamch_(char *); + doublereal bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = dlamch_("Safe minimum") / dlamch_("Precision"); + bignum = 1. / smlnum / 4.; + +/* Compute a scale factor. */ + + ret_val = 1.; + if (*bnorm <= 1.) { + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5 / *bnorm; + } + } + return ret_val; + +/* ==== End of DLARMM ==== */ + +} /* dlarmm_ */ + diff --git a/lapack-netlib/SRC/dlatrs3.c b/lapack-netlib/SRC/dlatrs3.c new file mode 100644 index 000000000..b6e15eb12 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.c @@ -0,0 +1,1138 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. 
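The scale factors are what keep the returned columns representable: column k of the output satisfies A * X(:,k) = scale(k) * B(:,k), so the nominal solution of A * x = b(:,k) is (1/scale(k)) * X(:,k) whenever scale(k) > 0. A minimal sketch of that optional post-processing step, assuming the division itself cannot overflow; the helper name is hypothetical and DLATRS3 never performs this division itself:

    /* Illustrative helper (not part of the patch): recover the nominal
       solution column from the scaled output of DLATRS3.  xk points at
       column k of X (n entries), scalek is SCALE(k).  Only meaningful when
       scalek > 0 and X(:,k)/scalek stays below the overflow threshold. */
    static void unscale_column(int n, double *xk, double scalek)
    {
        int i;
        if (scalek > 0.0 && scalek != 1.0) {
            for (i = 0; i < n; ++i)
                xk[i] /= scalek;     /* nominal x(:,k) = X(:,k) / scale(k) */
        }
    }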
*/ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is DOUBLE PRECISION array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. 
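The LWORK = -1 workspace query documented above follows the usual LAPACK convention: call once with LWORK = -1, read the required size from WORK(1), allocate, then call again. A minimal sketch against the f2c prototype introduced by this patch; the driver name, the upper-triangular/no-transpose choice and the error handling are illustrative assumptions:

    #include <stdlib.h>

    extern int dlatrs3_(char *uplo, char *trans, char *diag, char *normin,
                        int *n, int *nrhs, double *a, int *lda,
                        double *x, int *ldx, double *scale, double *cnorm,
                        double *work, int *lwork, int *info);

    /* Sketch of a caller: solve A * X = B * diag(scale) for an upper
       triangular A.  On entry X holds B (ldx x nrhs), scale has nrhs
       entries, cnorm has n entries.  Returns the INFO value. */
    static int solve_upper_notrans(int n, int nrhs, double *a, int lda,
                                   double *x, int ldx,
                                   double *scale, double *cnorm)
    {
        int info = 0, lwork = -1;
        double query = 0.0, *work;

        /* Workspace query: WORK(1) returns the required size. */
        dlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
                 scale, cnorm, &query, &lwork, &info);
        if (info != 0) return info;

        lwork = (int) query;
        work = (double *) malloc((size_t) lwork * sizeof(double));
        if (work == NULL) return -14;   /* treated here like a bad LWORK */

        /* Actual blocked solve with scaling to prevent overflow. */
        dlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
                 scale, cnorm, work, &lwork, &info);
        free(work);
        return info;
    }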
*/ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int dlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublereal *a, integer *lda, + doublereal *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *); + doublereal w[64]; + extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + integer *, doublereal *, integer *, doublereal *, doublereal *, + doublereal *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I+KK*LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. 
*/ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + dlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. 
To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). */ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = dlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. 
/ scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + dscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = dlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + dscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = j2 - j1; + dscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of DLATRS3 */ + +} /* dlatrs3_ */ + diff --git a/lapack-netlib/SRC/dtrsyl3.c b/lapack-netlib/SRC/dtrsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char 
*ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLARMM */ + +/* Definition: */ +/* =========== */ + +/* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) */ + +/* REAL ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is REAL */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is REAL */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +real slarmm_(real *anorm, real *bnorm, real *cnorm) +{ + /* System generated locals */ + real ret_val; + + /* Local variables */ + extern real slamch_(char *); + real bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = slamch_("Safe minimum") / slamch_("Precision"); + bignum = 1.f / smlnum / 4.f; + +/* Compute a scale factor. */ + + ret_val = 1.f; + if (*bnorm <= 1.f) { + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5f; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5f / *bnorm; + } + } + return ret_val; + +/* ==== End of SLARMM ==== */ + +} /* slarmm_ */ + diff --git a/lapack-netlib/SRC/slatrs3.c b/lapack-netlib/SRC/slatrs3.c new file mode 100644 index 000000000..2d8c0ab33 --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.c @@ -0,0 +1,1135 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. 
*/ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is REAL array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. 
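/* Hedged aside, not part of this patch: a standalone sketch of the growth
   bound described above. The routine itself obtains an equivalent factor from
   SLARMM; the helper name, the threshold choice and the halving loop here are
   assumptions made to keep the illustration free of corner-case arithmetic. */
#include <float.h>

float safe_update_scale(float anrm, float xnrm, float bnrm)
{
    const float threshold = FLT_MAX * 0.25f;   /* stay clearly below overflow */
    float scale = 1.0f;
    /* Halve the scale until  scale*bnrm + anrm*(scale*xnrm) <= threshold. */
    while (scale > 0.0f && bnrm * scale + anrm * (xnrm * scale) > threshold)
        scale *= 0.5f;
    return scale;                              /* s <= 1 used to rescale the RHS column */
}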
*/ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int slatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, real *a, integer *lda, real *x, + integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern logical lsame_(char *, char *); + real rscal; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk, lscale; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + integer *, real *, integer *, real *, real *, real *, integer *); + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "SLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. 
The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + slatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. 
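/* Hedged aside, not part of this patch: the workspace bookkeeping above,
   condensed into one helper. It reproduces the value the routine stores in
   WORK(1) (local scale factors plus block norm bounds) and is bounded by the
   documented MAX(1, 2*NBA*MAX(NBA, MIN(NRHS, 32))). The helper name is an
   assumption. */
int latrs3_min_lwork(int n, int nb, int nrhs)
{
    int nba = (n + nb - 1) / nb;            /* number of block rows/columns        */
    if (nba < 1) nba = 1;
    int width = nrhs < 32 ? nrhs : 32;      /* X is processed 32 columns at a time */
    int big = nba > width ? nba : width;
    int lscale = nba * big;                 /* one local scale factor per block row and RHS */
    int lanrm = nba * nba;                  /* upper bounds of the blocks of A     */
    int lwork = lscale + lanrm;
    return lwork > 1 ? lwork : 1;
}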
*/ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). */ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = slange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. 
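/* Hedged aside, not part of this patch: a loose sketch of the scale-factor
   combination guarded above. When the product of the running block factor and
   the factor returned by LATRS underflows, the running factor is clamped to
   the safe minimum and the deficit is pushed into the LATRS factor, so the
   column can still be rescaled. The routine additionally rescales or zeroes X,
   which this sketch omits; names are assumptions. */
void combine_scales(float *s_run, float *s_new, float smlnum)
{
    if (*s_new != 0.0f && *s_run * *s_new == 0.0f) {
        float shift = *s_run / smlnum;   /* how far s_run sits above the safe minimum */
        *s_new *= shift;                 /* move the deficit into the new factor       */
        *s_run = smlnum;                 /* smallest valid running factor              */
    }
    *s_new *= *s_run;                    /* combined factor actually applied           */
    *s_run = *s_new;                     /* becomes the running factor for this block  */
}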
*/ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + sscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = slange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). 
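/* Hedged aside, not part of this patch: the consistent rescaling applied
   before the GEMM update. Both vector segments are brought to the common scale
   SCAMIN = min(s_i, s_j) times the robust factor SCALOC so the update operates
   on consistently scaled data. Plain loops stand in for SSCAL; the helper name
   is an assumption. */
void rescale_segments(float *xi, int ni, float *si,
                      float *xj, int nj, float *sj, float scaloc)
{
    float scamin = *si < *sj ? *si : *sj;
    float scal_i = scamin / *si * scaloc;
    float scal_j = scamin / *sj * scaloc;
    if (scal_i != 1.0f) {
        for (int k = 0; k < ni; ++k) xi[k] *= scal_i;
        *si = scamin * scaloc;
    }
    if (scal_j != 1.0f) {
        for (int k = 0; k < nj; ++k) xj[k] *= scal_j;
        *sj = scamin * scaloc;
    }
}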
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + sscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + sscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of SLATRS3 */ + +} /* slatrs3_ */ + diff --git a/lapack-netlib/SRC/strsyl3.c b/lapack-netlib/SRC/strsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char 
*ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX*16 A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
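/* Hedged aside, not part of this patch: a usage sketch of the workspace query
   protocol described above, against the f2c-style prototype that appears
   further below. The wrapper name, the column-major caller-allocated a and x,
   and the minimal error handling are assumptions for illustration. */
#include <stdlib.h>

typedef int integer;
typedef double doublereal;
typedef struct { doublereal r, i; } doublecomplex;

extern int zlatrs3_(char *uplo, char *trans, char *diag, char *normin,
                    integer *n, integer *nrhs, doublecomplex *a, integer *lda,
                    doublecomplex *x, integer *ldx, doublereal *scale,
                    doublereal *cnorm, doublereal *work, integer *lwork,
                    integer *info);

int solve_upper_notrans(integer n, integer nrhs, doublecomplex *a, integer lda,
                        doublecomplex *x, integer ldx, doublereal *scale,
                        doublereal *cnorm)
{
    integer info = 0, lwork = -1;
    doublereal wkopt = 0.0;

    /* Workspace query: LWORK = -1 returns the optimal size in WORK(1). */
    zlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, &wkopt, &lwork, &info);
    if (info != 0) return (int) info;

    lwork = (integer) wkopt;
    doublereal *work = (doublereal *) malloc((size_t) lwork * sizeof(*work));
    if (work == NULL) return -1;

    /* Actual solve with the sized workspace. */
    zlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, work, &lwork, &info);
    free(work);
    return (int) info;
}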
*/ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int zlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublecomplex *a, integer *lda, + doublecomplex *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + doublereal w[64]; + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit; + extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + integer *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublereal *, integer *); + logical lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
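/* Hedged aside, not part of this patch: the block-norm bookkeeping described
   above, written 0-based for clarity. With 0-based block indices i and j and a
   0-based view of the work array, the bound of block A(I,J) sits at offset
   awrk + i + j*nba for the non-transposed solve and with i and j swapped
   otherwise. The helper name and the 0-based convention are assumptions. */
double block_norm_bound(const double *work0, int awrk, int nba,
                        int i, int j, int notran)
{
    int row = notran ? i : j;        /* block row index under which the bound was stored */
    int col = notran ? j : i;        /* block column index                               */
    return work0[awrk + row + col * nba];
}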
*/ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + zlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = zlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + zdscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = zlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + zdscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = j2 - j1; + zdscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__6, &i__7, &i__8, &z__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("T", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of ZLATRS3 */ + +} /* zlatrs3_ */ + diff --git a/lapack-netlib/SRC/ztrsyl3.c b/lapack-netlib/SRC/ztrsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i Date: Tue, 15 Nov 2022 16:26:44 +0100 Subject: [PATCH 096/154] Add f2c-converted files for the BLAS3-based Sylvester solver --- cmake/lapack.cmake | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index a78a89f1a..82511d41b 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -621,7 +621,8 @@ set(SLASRC ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c slaorhr_col_getrfnp.c - slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c ) + slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c + slarmm.c slatrs3.c strsyl3.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -718,7 +719,8 @@ set(CLASRC cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c - cungtsqr.c cungtsqr_row.c cunhr_col.c ) + cungtsqr.c cungtsqr_row.c cunhr_col.c + clatrs3.c ctrsyl3.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -809,7 +811,8 @@ set(DLASRC dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c - 
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c ) + dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c + dlarmm.c dlatrs3.c dtrsyl3.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -910,7 +913,7 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c From 5dec93e93b38954154f3a8e12c905be101eddbe9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Nov 2022 20:36:58 +0100 Subject: [PATCH 097/154] Complete the C conversion of the xTRSYL3 files --- lapack-netlib/SRC/ctrsyl3.c | 1518 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dtrsyl3.c | 1556 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/strsyl3.c | 1561 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ztrsyl3.c | 1519 ++++++++++++++++++++++++++++++++++ 4 files changed, 6154 insertions(+) diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c index d05923a46..70f265a14 100644 --- a/lapack-netlib/SRC/ctrsyl3.c +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1518 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static complex c_b1 = {1.f,0.f}; +static integer c__1 = 1; +static integer c_n1 = -1; +static real c_b18 = 2.f; +static real c_b106 = 1.f; + +/* > \brief \b CTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. 
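/* > The blocked algorithm solves each block X(K,L) with CTRSYL applied */
/* > to the diagonal blocks A(K,K) and B(L,L), performs the remaining */
/* > right-hand side updates with CGEMM, and keeps one local scale */
/* > factor per block of X in SWORK; these factors are reconciled into */
/* > the single output SCALE at the end. */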
*/ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). 
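/* A minimal calling sketch for the workspace query described above, */
/* assuming the f2c types (integer, real, complex) and the ctrsyl3_ */
/* prototype defined in this file plus <stdlib.h>; the helper name and */
/* the error handling are illustrative only, not part of the API. */

extern int ctrsyl3_(char *trana, char *tranb, integer *isgn, integer *m,
                    integer *n, complex *a, integer *lda, complex *b,
                    integer *ldb, complex *c__, integer *ldc, real *scale,
                    real *swork, integer *ldswork, integer *info);

static int solve_sylvester_sketch(integer m, integer n, complex *a,
                                  integer lda, complex *b, integer ldb,
                                  complex *c, integer ldc)
{
    char trana[] = "N", tranb[] = "N";
    integer isgn = 1, info = 0, ldswork = -1, rows, cols;
    real scale, query[2], *swork;

    /* Workspace query: LDSWORK = -1 only reports the optimal SWORK */
    /* dimensions in SWORK(1) and SWORK(2). */
    ctrsyl3_(trana, tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, query, &ldswork, &info);
    rows = (integer) query[0];
    cols = (integer) query[1];
    if (rows < 2) rows = 2;
    if (cols < 1) cols = 1;

    swork = (real *) malloc((size_t) rows * (size_t) cols * sizeof(real));
    if (swork == NULL) return -1;
    ldswork = rows;

    /* Solve A*X + X*B = scale*C; X overwrites C and SCALE is set <= 1. */
    ctrsyl3_(trana, tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, swork, &ldswork, &info);
    free(swork);
    return (int) info;
}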
*/ +/* > \endverbatim */ + +/* > \ingroup complexSYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int ctrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, complex *a, integer *lda, complex *b, integer + *ldb, complex *c__, integer *ldc, real *scale, real *swork, integer * + ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3, r__4; + complex q__1; + + /* Local variables */ + real scal; + complex csgn; + real anrm, bnrm, cnrm; + integer awrk, bwrk; + real *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(real *); + integer nb, jj, ll; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + extern /* Subroutine */ int ctrsyl_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, real *, integer *); + logical lquery; + integer nba, nbb; + real buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. 
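/* The solver below partitions A into NBA x NBA and B into NBB x NBB */
/* blocks of size NB = max(8, ILAENV(1, 'CTRSYL', ...)). SWORK(K,L) with */
/* K <= NBA, L <= NBB holds the local scale factor of solution block */
/* X(K,L); the column groups starting after columns AWRK = NBB and */
/* BWRK = NBB + NBA cache norm upper bounds of the blocks of A and B, */
/* which is why the optimal workspace is ROWS = max(NBA,NBB) by */
/* COLS = 2*NBB + NBA. */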
*/ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (real*)malloc(f2cmax(*m,*n)*sizeof(real)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ctrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + q__1.r = sgn, q__1.i = 0.f; + csgn.r = q__1.r, csgn.i = q__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
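/* BUF is a power-of-two buffer for the global scale: MYEXP_ returns the */
/* binary exponent e of SCALOC (via frexpf) and POW_RI(2,e) the matching */
/* power of two. Whenever SCALOC would flush a local scale factor to */
/* zero, that power of two is absorbed into BUF and the SWORK entries */
/* are rescaled (capped at BIGNUM) so that they stay representable; BUF */
/* is folded back into the returned SCALE at the very end of the routine. */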
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = clange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
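/* Worked example of the consistent scaling: with SWORK(I,L) = 1 and */
/* SWORK(K,L) = 0.5 the common scale is SCAMIN = 0.5, so the block */
/* C(I1:I2-1, L1:L2-1) is multiplied by SCAMIN/SWORK(I,L) = 0.5 and */
/* C(K1:K2-1, L1:L2-1) by SCAMIN/SWORK(K,L) = 1, each times the robust */
/* factor SCALOC <= 1 from SLARMM that keeps the GEMM update below */
/* overflow; afterwards both blocks record the scale SCAMIN*SCALOC. */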
*/ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = clange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__4, &i__5, &i__6, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = clange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__4, &i__5, &i__6, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = clange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + csscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__3, &i__4, &i__5, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = clange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = clange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__2, &i__3, &i__4, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + r__3 = (r__1 = c__[i__1].r, abs(r__1)), r__4 = (r__2 = r_imag(&c__[ + c_dim1 + 1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + r__3 = scal, r__4 = (r__1 = c__[i__3].r, abs(r__1)), r__3 = + f2cmax(r__3,r__4), r__4 = (r__2 = r_imag(&c__[k + l * + c_dim1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + clascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of CTRSYL3 */ + +} /* ctrsyl3_ */ + diff --git a/lapack-netlib/SRC/dtrsyl3.c b/lapack-netlib/SRC/dtrsyl3.c index d05923a46..199baab75 100644 --- a/lapack-netlib/SRC/dtrsyl3.c +++ b/lapack-netlib/SRC/dtrsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1556 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c__1 = 1; +static integer c_n1 = -1; +static doublereal c_b19 = 2.; +static doublereal c_b31 = -1.; +static doublereal c_b32 = 1.; + +/* > \brief \b DTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DTRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by DHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is DOUBLE PRECISION array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). 
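The IWORK and SWORK descriptions above follow the usual LAPACK workspace-query convention: call once with LIWORK = -1 or LDSWORK = -1 to obtain the required sizes, allocate, then call again to solve. A hedged sketch of that calling sequence; the typedefs, the solve_sylvester wrapper and the matrix arguments are illustrative assumptions, and the integer type must match the f2c.h of the actual build:

#include <stdio.h>
#include <stdlib.h>

typedef int integer;        // assumption: adjust to the f2c.h / blasint of the build
typedef double doublereal;

extern int dtrsyl3_(char *trana, char *tranb, integer *isgn,
                    integer *m, integer *n, doublereal *a, integer *lda,
                    doublereal *b, integer *ldb, doublereal *c, integer *ldc,
                    doublereal *scale, integer *iwork, integer *liwork,
                    doublereal *swork, integer *ldswork, integer *info);

void solve_sylvester(integer m, integer n, doublereal *a, doublereal *b, doublereal *c)
{
    integer isgn = 1, info = 0, lda = m, ldb = n, ldc = m;
    doublereal scale, query[2];
    integer iquery[1], liwork = -1, ldswork = -1;

    // 1) Workspace query: IWORK(1) reports LIWORK, SWORK(1:2) report ROWS and COLS.
    dtrsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, iquery, &liwork, query, &ldswork, &info);

    liwork  = iquery[0];
    ldswork = (integer)query[0];                 // ROWS
    integer cols = (integer)query[1];            // COLS
    integer *iwork = malloc((size_t)liwork * sizeof *iwork);
    doublereal *swork = malloc((size_t)ldswork * (size_t)cols * sizeof *swork);

    // 2) Actual solve: on exit C holds X and SCALE <= 1.
    dtrsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, iwork, &liwork, swork, &ldswork, &info);
    printf("info = %d, scale = %g\n", (int)info, scale);

    free(iwork);
    free(swork);
}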
*/ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int dtrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublereal *a, integer *lda, doublereal *b, + integer *ldb, doublereal *c__, integer *ldc, doublereal *scale, + integer *iwork, integer *liwork, doublereal *swork, integer *ldswork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3; + + /* Local variables */ + doublereal scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *), dgemm_(char *, char *, integer *, integer *, integer * + , doublereal *, doublereal *, integer *, doublereal *, integer *, + doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(doublereal *); + integer l2, nb, pc, jj, ll; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrsyl_(char *, char *, integer *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. 
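The parameter adjustments above are the standard f2c idiom: subtracting 1 + LDA from the base pointer once lets the translated body keep Fortran's 1-based, column-major indexing, so a[i + j * a_dim1] addresses the Fortran element A(i,j). A small self-contained illustration of the idiom; fortran_at and the 2-by-2 matrix are made up for the example:

#include <stdio.h>

// Return the Fortran element A(i,j) of a column-major lda-by-n matrix,
// using the same base-pointer shift as the f2c-translated routines above.
static double fortran_at(const double *a0, int lda, int i, int j)
{
    int a_dim1 = lda;
    int a_offset = 1 + a_dim1;          // offset of A(1,1)
    const double *a = a0 - a_offset;    // shift base so 1-based indexing works
    return a[i + j * a_dim1];           // Fortran A(i,j)
}

int main(void)
{
    // 2-by-2 column-major matrix: columns stored contiguously
    double a0[4] = {1.0, 2.0,    // A(1,1), A(2,1)
                    3.0, 4.0};   // A(1,2), A(2,2)
    printf("A(2,1) = %g\n", fortran_at(a0, 2, 2, 1));   // prints 2
    printf("A(1,2) = %g\n", fortran_at(a0, 2, 1, 2));   // prints 3
    return 0;
}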
*/ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! + lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + dtrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0. && a[l + 1 + l * a_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0. && b[l + 1 + l * b_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. 
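The partition pass above enforces a single invariant: a 2-by-2 diagonal block of the quasi-triangular matrix must never be cut by a block boundary, so whenever a nonzero subdiagonal entry lands exactly on a boundary the boundary is pushed down by one row. A simplified standalone sketch of that adjustment; the partition helper and the subdiagonal pattern are invented for illustration and omit the end-of-matrix bookkeeping done above:

#include <stdio.h>

// Partition rows 1..m into blocks of nominal size nb, shifting any boundary
// that would split a 2-by-2 diagonal block (sub[l] != 0 means rows l and l+1
// belong to one 2-by-2 block; 1-based indexing as in the Fortran original).
static void partition(int m, int nb, const int *sub, int *start, int *nblocks)
{
    int nba = (m + nb - 1) / nb;
    if (nba < 1) nba = 1;
    for (int i = 1; i <= nba; ++i)
        start[i] = (i - 1) * nb + 1;
    start[nba + 1] = m + 1;

    for (int k = 1; k <= nba; ++k)
        for (int l = start[k]; l < start[k + 1] && l < m; ++l)
            if (sub[l] && l + 1 == start[k + 1])
                ++start[k + 1];          // keep the 2-by-2 block in one partition
    *nblocks = nba;
}

int main(void)
{
    // m = 8, nb = 4; a hypothetical 2-by-2 block occupies rows 4 and 5
    int sub[9] = {0, 0, 0, 0, 1, 0, 0, 0, 0};
    int start[4], nba;
    partition(8, 4, sub, start, &nba);
    for (int k = 1; k <= nba; ++k)
        printf("block %d: rows %d..%d\n", k, start[k], start[k + 1] - 1);
    return 0;
}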
*/ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
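A scalar picture may help here: before the GEMM update C(I,L) := C(I,L) - A(I,K) * C(K,L), the two C blocks can carry different local scales, so both are first brought to their common (smaller) scale SCAMIN, optionally reduced further by the DLARMM factor SCALOC, and only then is the subtraction performed on consistently scaled data. A toy sketch with scalars in place of blocks, taking SCALOC = 1 for brevity:

#include <stdio.h>

int main(void)
{
    // True values are stored_value / local_scale, as in SWORK(K,L).
    double x = 6.0, sx = 0.5;     // solution block C(K,L): true value 12
    double c = 5.0, sc = 1.0;     // block to update C(I,L): true value 5
    double a = 2.0;               // block of A, unscaled

    double scamin = sx < sc ? sx : sc;   // common scale of the pair
    x *= scamin / sx;                    // both values now live at scale scamin
    c *= scamin / sc;

    c -= a * x;                          // consistent update at scale scamin
    sx = sc = scamin;                    // record the shared scale

    // True updated value: 5 - 2 * 12 = -19, recovered as c / scamin.
    printf("stored %g at scale %g, true %g\n", c, scamin, c / scamin);
    return 0;
}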
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = dlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = dlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + dgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = dlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__4, &i__5, &i__6, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = dlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + dscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + dgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = dlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = dlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__2, &i__3, &i__4, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + d__2 = scal, d__3 = (d__1 = c__[k + l * c_dim1], abs(d__1)); + scal = f2cmax(d__2,d__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. 
/ buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + dlascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of DTRSYL3 */ + +} /* dtrsyl3_ */ + diff --git a/lapack-netlib/SRC/strsyl3.c b/lapack-netlib/SRC/strsyl3.c index d05923a46..85d68e017 100644 --- a/lapack-netlib/SRC/strsyl3.c +++ b/lapack-netlib/SRC/strsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(float* x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1561 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c__1 = 1; +static integer c_n1 = -1; +static real c_b19 = 2.f; +static real c_b31 = -1.f; +static real c_b32 = 1.f; + +/* > \brief \b STRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > STRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by SHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. 
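Both translated files add a small my_expfunc helper (frexp in the double-precision file, frexpf here), apparently standing in for Fortran's EXPONENT intrinsic: it returns the binary exponent e with x = m * 2^e and 0.5 <= |m| < 1, and the solvers multiply BUF and divide the SWORK entries by exact powers of two built from it. A minimal standalone check of that behaviour, with binary_exponent as an illustrative stand-in:

#include <math.h>
#include <stdio.h>

// Same idea as the my_expfunc helper added above: the binary exponent of x,
// i.e. the e with x = m * pow(2, e) and 0.5 <= fabs(m) < 1.
static int binary_exponent(double x)
{
    int e;
    (void)frexp(x, &e);
    return e;
}

int main(void)
{
    double scaloc = 0.125;                    // a hypothetical TRSYL scale factor
    int e = binary_exponent(scaloc);          // 0.125 = 0.5 * 2^-2, so e = -2
    double buf = 1.0;
    buf *= pow(2.0, e);                       // exact power-of-two bookkeeping
    printf("exponent(%g) = %d, buf = %g\n", scaloc, e, buf);
    printf("exponent(12.0) = %d\n", binary_exponent(12.0));   // 12 = 0.75 * 2^4
    return 0;
}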
*/ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is REAL array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int strsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb, + real *c__, integer *ldc, real *scale, integer *iwork, integer *liwork, + real *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3; + + /* Local variables */ + real scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + real *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(real *); + integer l2, nb, pc, jj, ll; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *); + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + logical lquery; + extern /* Subroutine */ int strsyl_(char *, char *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, integer * + , real *, integer *); + integer nba, nbb; + real buf, sgn; + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + 
c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "STRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! + lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } else if (! lquery && *liwork < iwork[1]) { + *info = -14; + } else if (! lquery && *ldswork < f2cmax(nba,nbb)) { + *info = -16; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + strsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + + +/* REAL WNRM( MAX( M, N ) ) */ + wnrm=(real*)malloc (f2cmax(*m,*n)*sizeof(real)); + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0.f && a[l + 1 + l * a_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0.f && b[l + 1 + l * 
b_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. 
The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = slange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = slange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + sgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = slange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__4, &i__5, &i__6, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = slange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + sscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + sgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = slange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = slange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__2, &i__3, &i__4, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + r__2 = scal, r__3 = (r__1 = c__[k + l * c_dim1], abs(r__1)); + scal = f2cmax(r__2,r__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. 
*/ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + slascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of STRSYL3 */ + +} /* strsyl3_ */ + diff --git a/lapack-netlib/SRC/ztrsyl3.c b/lapack-netlib/SRC/ztrsyl3.c index d05923a46..c1be7d589 100644 --- a/lapack-netlib/SRC/ztrsyl3.c +++ b/lapack-netlib/SRC/ztrsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1519 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublecomplex c_b1 = {1.,0.}; +static integer c__1 = 1; +static integer c_n1 = -1; +static doublereal c_b18 = 2.; +static doublereal c_b106 = 1.; + +/* > \brief \b ZTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX*16 array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complex16SYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int ztrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex + *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *scale, + doublereal *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3, d__4; + doublecomplex z__1; + + /* Local variables */ + doublereal scal; + doublecomplex csgn; + doublereal anrm, bnrm, cnrm; + integer awrk, bwrk; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(doublereal *); + integer nb, jj, ll; + extern doublereal dlamch_(char *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex * + , integer *, integer *); + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int ztrsyl_(char *, char *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ztrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + z__1.r = sgn, z__1.i = 0.; + csgn.r = z__1.r, csgn.i = z__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = zlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = zlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__4, &i__5, &i__6, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = zlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__4, &i__5, &i__6, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = zlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + zdscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__3, &i__4, &i__5, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = zlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = zlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__2, &i__3, &i__4, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + d__3 = (d__1 = c__[i__1].r, abs(d__1)), d__4 = (d__2 = d_imag(&c__[ + c_dim1 + 1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + d__3 = scal, d__4 = (d__1 = c__[i__3].r, abs(d__1)), d__3 = + f2cmax(d__3,d__4), d__4 = (d__2 = d_imag(&c__[k + l * + c_dim1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + zlascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of ZTRSYL3 */ + +} /* ztrsyl3_ */ + From 2a97ca615f373d7385df3becf97d1295cc3fc29e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Nov 2022 07:36:40 +0100 Subject: [PATCH 098/154] MSVC compatibility fixes --- lapack-netlib/SRC/clatrs3.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/ctrsyl3.c | 155 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dlarmm.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dlatrs3.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dtrsyl3.c | 155 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/slarmm.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/slatrs3.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/strsyl3.c | 158 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/zlatrs3.c | 156 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/ztrsyl3.c | 159 ++++++++++++++++++++++++++++++++---- 10 files changed, 1413 insertions(+), 155 deletions(-) diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c index 6124a7f19..f6d76cf49 100644 --- a/lapack-netlib/SRC/clatrs3.c +++ b/lapack-netlib/SRC/clatrs3.c @@ -1,12 +1,3 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - #include #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = 
Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + 
+#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,14 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else 
static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex 
double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLARMM */ /* Definition: */ diff --git a/lapack-netlib/SRC/dlatrs3.c b/lapack-netlib/SRC/dlatrs3.c index b6e15eb12..46eca6379 100644 --- a/lapack-netlib/SRC/dlatrs3.c +++ b/lapack-netlib/SRC/dlatrs3.c @@ -1,12 +1,3 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - #include #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define 
d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef 
_MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,14 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else 
static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex 
double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLARMM */ /* Definition: */ diff --git a/lapack-netlib/SRC/slatrs3.c b/lapack-netlib/SRC/slatrs3.c index 2d8c0ab33..e5c48a55b 100644 --- a/lapack-netlib/SRC/slatrs3.c +++ b/lapack-netlib/SRC/slatrs3.c @@ -1,12 +1,3 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - #include #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define 
d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef 
_MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,14 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) -static int my_expfunc(float* x) {int e; (void)frexpf(*x,&e); return e;} +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static 
inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float 
Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,15 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +295,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +322,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +350,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +384,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i Date: Wed, 16 Nov 2022 21:57:42 +0100 Subject: [PATCH 099/154] change line endings from CRLF to LF --- cmake/lapack.cmake | 2028 ++++++++++++++++++++++---------------------- 1 file changed, 1014 insertions(+), 1014 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 82511d41b..ca3a1e184 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,1014 +1,1014 @@ -# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. 
-if (NOT C_LAPACK) - message (STATUS "fortran lapack") -set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90 - ../INSTALL/ilaver.f xerbla_array.f - ../INSTALL/slamch.f) - -set(SCLAUX - scombssq.f sbdsvdx.f sstevx.f sstein.f - la_constants.f90 - sbdsdc.f - sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f - slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f - slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f - slagts.f slamrg.f slanst.f - slapy2.f slapy3.f slarnv.f - slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f - slarrk.f slarrr.f slaneg.f - slartg.f90 slaruv.f slas2.f slascl.f - slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f - slasd7.f slasd8.f slasda.f slasdq.f slasdt.f - slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f - slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f - ssteqr.f ssterf.f slaisnan.f sisnan.f - slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f - ../INSTALL/second_${TIMER}.f) - -set(DZLAUX - la_constants.f90 - dbdsdc.f - dbdsvdx.f dstevx.f dstein.f - dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f - dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f - dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f - dlagts.f dlamrg.f dlanst.f - dlapy2.f dlapy3.f dlarnv.f - dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f - dlarrk.f dlarrr.f dlaneg.f - dlartg.f90 dlaruv.f dlas2.f dlascl.f - dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f - dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f - dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f - dsteqr.f dsterf.f dlaisnan.f disnan.f - dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f - ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) - -set(SLASRC - sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f - sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f - sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f - sgehd2.f sgehrd.f sgelq2.f sgelqf.f - sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f - sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f - sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f - sgetrf2.f sgetri.f - sggbak.f sggbal.f - sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f - sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f - sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f - sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f - shsein.f shseqr.f slabrd.f slacon.f slacn2.f - slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f - slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f - slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f - slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f - slansy.f slantb.f slantp.f slantr.f slanv2.f - slapll.f slapmt.f - slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f - slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f - slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f - slarrv.f slartv.f - slarz.f slarzb.f slarzt.f slasy2.f - slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f - slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f - sopgtr.f sopmtr.f sorg2l.f sorg2r.f - sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f - sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f - sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f 
sormr2.f - sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f - spbstf.f spbsv.f spbsvx.f - spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f - sposvx.f spotrf2.f spotri.f spstrf.f spstf2.f - sppcon.f sppequ.f - spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f - spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f - ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f - ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f - sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f - ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f - ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f - ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f - ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f - ssyswapr.f ssytrs.f ssytrs2.f - ssyconv.f ssyconvf.f ssyconvf_rook.f - ssysv_aa.f ssysv_aa_2stage.f ssytrf_aa.f ssytrf_aa_2stage.f ssytrs_aa.f ssytrs_aa_2stage.f - ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f - ssytri_rook.f ssycon_rook.f ssysv_rook.f - ssytf2_rk.f ssytrf_rk.f ssytrs_3.f - ssytri_3.f ssytri_3x.f ssycon_3.f ssysv_rk.f - ssysv_aa.f ssytrf_aa.f ssytrs_aa.f - stbcon.f - stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f - stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f - stptrs.f - strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f - strtrs.f stzrzf.f sstemr.f - slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f - stfttr.f stpttf.f stpttr.f strttf.f strttp.f - sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f - sgeequb.f ssyequb.f spoequb.f sgbequb.f - sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f - sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f - sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f - stpqrt.f stpqrt2.f stpmqrt.f stprfb.f - sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f - sgelq.f slaswlq.f slamswlq.f sgemlq.f - stplqt.f stplqt2.f stpmlqt.f - ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f - ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f - ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f - sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f - slarmm.f slatrs3.f strsyl3.f) - -set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f - sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f - sla_syrfsx_extended.f sla_syamv.f sla_syrcond.f sla_syrpvgrw.f - sposvxx.f sporfsx.f sla_porfsx_extended.f sla_porcond.f - sla_porpvgrw.f sgbsvxx.f sgbrfsx.f sla_gbrfsx_extended.f - sla_gbamv.f sla_gbrcond.f sla_gbrpvgrw.f sla_lin_berr.f slarscl2.f - slascl2.f sla_wwaddw.f) - -set(CLASRC - cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f - cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f - cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f - cgehd2.f cgehrd.f cgelq2.f cgelqf.f - cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f - cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f - cgesc2.f cgesdd.f cgesvd.f cgesvdx.f - cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f - cgesvx.f cgetc2.f cgetrf2.f - cgetri.f - cggbak.f cggbal.f - cgges.f cgges3.f cggesx.f cggev.f cggev3.f cggevx.f - cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f - cggsvd3.f cggsvp3.f - cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f - chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f - checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f - chegv.f chegvd.f chegvx.f 
cherfs.f chesv.f chesvx.f chetd2.f - chetf2.f chetrd.f - chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f - chetrs.f chetrs2.f - chetf2_rook.f chetrf_rook.f chetri_rook.f - chetrs_rook.f checon_rook.f chesv_rook.f - chetf2_rk.f chetrf_rk.f chetri_3.f chetri_3x.f - chetrs_3.f checon_3.f chesv_rk.f - chesv_aa.f chesv_aa_2stage.f chetrf_aa.f chetrf_aa_2stage.f chetrs_aa.f chetrs_aa_2stage.f - chgeqz.f chpcon.f chpev.f chpevd.f - chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f - chpsvx.f - chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f - clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f - claed0.f claed7.f claed8.f - claein.f claesy.f claev2.f clags2.f clagtm.f - clahef.f clahef_rook.f clahef_rk.f clahef_aa.f clahqr.f - clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f - clanhb.f clanhe.f - clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f - clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f - claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f - claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f - claqz0.f claqz1.f claqz2.f claqz3.f - claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f - clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f - clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 - clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f - clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f - cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f - cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f - cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f - cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f - cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f - crot.f cspcon.f csprfs.f cspsv.f - cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f - cstegr.f cstein.f csteqr.f csycon.f - csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f - csytri2.f csytri2x.f csyswapr.f - csytrs.f csytrs2.f - csyconv.f csyconvf.f csyconvf_rook.f - csytf2_rook.f csytrf_rook.f csytrs_rook.f - csytri_rook.f csycon_rook.f csysv_rook.f - csytf2_rk.f csytrf_rk.f csytrf_aa.f csytrf_aa_2stage.f csytrs_3.f csytrs_aa.f csytrs_aa_2stage.f - csytri_3.f csytri_3x.f csycon_3.f csysv_rk.f csysv_aa.f csysv_aa_2stage.f - ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f - ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f - ctprfs.f ctptri.f - ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f - ctrsyl.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f - cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f - cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f - cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f - cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f - chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f - ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f - cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f - cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f - cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f - cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f - ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f - cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f - cgelq.f claswlq.f clamswlq.f cgemlq.f - ctplqt.f ctplqt2.f ctpmlqt.f - chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f - cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f - chbev_2stage.f 
chbevx_2stage.f chbevd_2stage.f chegv_2stage.f - cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cungtsqr_row.f cunhr_col.f - clatrs3.f ctrsyl3.f ) - -set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f - cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f - csysvxx.f csyrfsx.f cla_syrfsx_extended.f cla_syamv.f - cla_syrcond_c.f cla_syrcond_x.f cla_syrpvgrw.f - cposvxx.f cporfsx.f cla_porfsx_extended.f - cla_porcond_c.f cla_porcond_x.f cla_porpvgrw.f - cgbsvxx.f cgbrfsx.f cla_gbrfsx_extended.f cla_gbamv.f - cla_gbrcond_c.f cla_gbrcond_x.f cla_gbrpvgrw.f - chesvxx.f cherfsx.f cla_herfsx_extended.f cla_heamv.f - cla_hercond_c.f cla_hercond_x.f cla_herpvgrw.f - cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) - -set(DLASRC - dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f - dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f - dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f - dgehd2.f dgehrd.f dgelq2.f dgelqf.f - dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f - dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f - dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f - dgetrf2.f dgetri.f - dggbak.f dggbal.f - dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f - dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f - dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f - dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f - dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f - dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f - dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f - dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f - dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f - dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f - dlapll.f dlapmt.f - dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f - dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f - dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f - dlargv.f dlarrv.f dlartv.f - dlarz.f dlarzb.f dlarzt.f dlasy2.f - dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f - dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f - dopgtr.f dopmtr.f dorg2l.f dorg2r.f - dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f - dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f - dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f - dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f - dpbstf.f dpbsv.f dpbsvx.f - dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f - dposvx.f dpotrf2.f dpotri.f dpotrs.f dpstrf.f dpstf2.f - dppcon.f dppequ.f - dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f - dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f - dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f - dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f - dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f - dsycon.f dsyev.f dsyevd.f dsyevr.f - dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f - dsysv.f dsysvx.f - dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f - dsytri2.f dsytri2x.f dsyswapr.f - dsyconv.f dsyconvf.f dsyconvf_rook.f - dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f - dsytri_rook.f dsycon_rook.f dsysv_rook.f - dsytf2_rk.f dsytrf_rk.f dsytrs_3.f - dsytri_3.f dsytri_3x.f dsycon_3.f dsysv_rk.f - dsysv_aa.f dsysv_aa_2stage.f dsytrf_aa.f dsytrf_aa_2stage.f dsytrs_aa.f dsytrs_aa_2stage.f - dtbcon.f - dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f 
dtgexc.f dtgsen.f - dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f - dtptrs.f - dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrtrs.f dtzrzf.f dstemr.f - dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f - dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f - dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f - dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f - dgeequb.f dsyequb.f dpoequb.f dgbequb.f - dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f - dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f - dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f - dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f - dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f - dgelq.f dlaswlq.f dlamswlq.f dgemlq.f - dtplqt.f dtplqt2.f dtpmlqt.f - dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f - dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f - dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f - dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f - dlarmm.f dlatrs3.f dtrsyl3.f) - -set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f - dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f - dla_syrfsx_extended.f dla_syamv.f dla_syrcond.f dla_syrpvgrw.f - dposvxx.f dporfsx.f dla_porfsx_extended.f dla_porcond.f - dla_porpvgrw.f dgbsvxx.f dgbrfsx.f dla_gbrfsx_extended.f - dla_gbamv.f dla_gbrcond.f dla_gbrpvgrw.f dla_lin_berr.f dlarscl2.f - dlascl2.f dla_wwaddw.f) - -set(ZLASRC - zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f - zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f - zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f - zgehd2.f zgehrd.f zgelq2.f zgelqf.f - zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f - zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f - zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f - zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f - zgetc2.f zgetrf2.f - zgetri.f - zggbak.f zggbal.f - zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f - zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f - zggsvd3.f zggsvp3.f - zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f - zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f - zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f - zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f - zhetf2.f zhetrd.f - zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f - zhetrs.f zhetrs2.f - zhetf2_rook.f zhetrf_rook.f zhetri_rook.f - zhetrs_rook.f zhecon_rook.f zhesv_rook.f - zhetf2_rk.f zhetrf_rk.f zhetri_3.f zhetri_3x.f - zhetrs_3.f zhecon_3.f zhesv_rk.f - zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f - zhgeqz.f zhpcon.f zhpev.f zhpevd.f - zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f - zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f - zhpsvx.f - zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f - zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f - zlaed0.f zlaed7.f zlaed8.f - zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f - zlahef.f zlahef_rook.f zlahef_rk.f zlahef_aa.f zlahqr.f - zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f - zlangt.f zlanhb.f - zlanhe.f - zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f - zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f - zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f - zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f - zlaqsp.f 
zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f - zlarfg.f zlarfgp.f zlarft.f - zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f - zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f - zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f - zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f - zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f - zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f - zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f - zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f - zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f - zrot.f zspcon.f zsprfs.f zspsv.f - zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f - zstegr.f zstein.f zsteqr.f zsycon.f - zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f - zsytri2.f zsytri2x.f zsyswapr.f - zsytrs.f zsytrs2.f - zsyconv.f zsyconvf.f zsyconvf_rook.f - zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f zsytrs_aa.f zsytrs_aa_2stage.f - zsytri_rook.f zsycon_rook.f zsysv_rook.f - zsytf2_rk.f zsytrf_rk.f zsytrf_aa.f zsytrf_aa_2stage.f zsytrs_3.f - zsytri_3.f zsytri_3x.f zsycon_3.f zsysv_rk.f zsysv_aa.f zsysv_aa_2stage.f - ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f - ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f - ztprfs.f ztptri.f - ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f - ztrsyl.f ztrtrs.f ztzrzf.f zung2l.f - zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f - zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f - zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f - zunmtr.f zupgtr.f - zupmtr.f izmax1.f dzsum1.f zstemr.f - zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f - zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f - ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f - zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f - zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f - zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f - zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f - ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f - ztplqt.f ztplqt2.f ztpmlqt.f - zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f - zgelq.f zlaswlq.f zlamswlq.f zgemlq.f - zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f - zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f - zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f - zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zungtsqr_row.f zunhr_col.f - zlatrs3.f ztrsyl3.f) - -set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f - zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f - zla_syrfsx_extended.f zla_syamv.f zla_syrcond_c.f zla_syrcond_x.f - zla_syrpvgrw.f zposvxx.f zporfsx.f zla_porfsx_extended.f - zla_porcond_c.f zla_porcond_x.f zla_porpvgrw.f zgbsvxx.f zgbrfsx.f - zla_gbrfsx_extended.f zla_gbamv.f zla_gbrcond_c.f zla_gbrcond_x.f - zla_gbrpvgrw.f zhesvxx.f zherfsx.f zla_herfsx_extended.f - zla_heamv.f zla_hercond_c.f zla_hercond_x.f zla_herpvgrw.f - zla_lin_berr.f zlarscl2.f zlascl2.f zla_wwaddw.f) - - -if(USE_XBLAS) - set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) -endif() - -list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f - DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f - DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) -list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f - 
DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f - DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) -list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f - DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f - DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) -list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f - DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f - DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) -message(STATUS "Building deprecated routines") - -set(DSLASRC spotrs.f) - -set(ZCLASRC cpotrs.f) - -set(SCATGEN slatm1.f slaran.f slarnd.f) - -set(SMATGEN slatms.f slatme.f slatmr.f slatmt.f - slagge.f slagsy.f slakf2.f slarge.f slaror.f slarot.f slatm2.f - slatm3.f slatm5.f slatm6.f slatm7.f slahilb.f) - -set(CMATGEN clatms.f clatme.f clatmr.f clatmt.f - clagge.f claghe.f clagsy.f clakf2.f clarge.f claror.f clarot.f - clatm1.f clarnd.f clatm2.f clatm3.f clatm5.f clatm6.f clahilb.f slatm7.f) - -set(DZATGEN dlatm1.f dlaran.f dlarnd.f) - -set(DMATGEN dlatms.f dlatme.f dlatmr.f dlatmt.f - dlagge.f dlagsy.f dlakf2.f dlarge.f dlaror.f dlarot.f dlatm2.f - dlatm3.f dlatm5.f dlatm6.f dlatm7.f dlahilb.f) - -set(ZMATGEN zlatms.f zlatme.f zlatmr.f zlatmt.f - zlagge.f zlaghe.f zlagsy.f zlakf2.f zlarge.f zlaror.f zlarot.f - zlatm1.f zlarnd.f zlatm2.f zlatm3.f zlatm5.f zlatm6.f zlahilb.f dlatm7.f) - -if(BUILD_SINGLE) - set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) - set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision") -endif() -if(BUILD_DOUBLE) - set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) - set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) - message(STATUS "Building Double Precision") -endif() -if(BUILD_COMPLEX) - set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision Complex") -endif() -if(BUILD_COMPLEX16) - set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) -# for zlange/zlanhe - if (NOT BUILD_DOUBLE) - set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) - endif () - message(STATUS "Building Double Precision Complex") -endif() - -else () - - message (STATUS "c lapack") -set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c - ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c - ../INSTALL/ilaver.c xerbla_array.c - ../INSTALL/slamch.c) - -set(SCLAUX - scombssq.c sbdsvdx.c sstevx.c sstein.c - sbdsdc.c - sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c - slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c - slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c - slagts.c slamrg.c slanst.c - slapy2.c slapy3.c slarnv.c - slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c - slarrk.c slarrr.c slaneg.c - slartg.c slaruv.c slas2.c slascl.c - slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c - slasd7.c slasd8.c slasda.c slasdq.c slasdt.c - slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c - slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c - ssteqr.c ssterf.c slaisnan.c sisnan.c - slartgp.c slartgs.c - ../INSTALL/second_${TIMER}.c) - -set(DZLAUX - dbdsdc.c - dbdsvdx.c dstevx.c dstein.c - dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c - dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c - 
dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c - dlagts.c dlamrg.c dlanst.c - dlapy2.c dlapy3.c dlarnv.c - dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c - dlarrk.c dlarrr.c dlaneg.c - dlartg.c dlaruv.c dlas2.c dlascl.c - dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c - dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c - dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c - dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c - dsteqr.c dsterf.c dlaisnan.c disnan.c - dlartgp.c dlartgs.c - ../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c) - -set(SLASRC - sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c - sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c - sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c - sgehd2.c sgehrd.c sgelq2.c sgelqf.c - sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c - sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c - sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c - sgetrf2.c sgetri.c - sggbak.c sggbal.c - sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c - sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c - sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c - sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c - shsein.c shseqr.c slabrd.c slacon.c slacn2.c - slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c - slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c - slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c - slansy.c slantb.c slantp.c slantr.c slanv2.c - slapll.c slapmt.c - slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c - slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c - slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c - slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c - slarrv.c slartv.c - slarz.c slarzb.c slarzt.c slasy2.c - slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c - slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c - sopgtr.c sopmtr.c sorg2l.c sorg2r.c - sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c - sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c - sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c - sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c - spbstf.c spbsv.c spbsvx.c - spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c - sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c - sppcon.c sppequ.c - spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c - spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c - ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c - ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c - sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c - ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c - ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c - ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c - ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c - ssyswapr.c ssytrs.c ssytrs2.c - ssyconv.c ssyconvf.c ssyconvf_rook.c - ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c - ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c - ssytri_rook.c ssycon_rook.c ssysv_rook.c - ssytf2_rk.c ssytrf_rk.c ssytrs_3.c - ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c - ssysv_aa.c ssytrf_aa.c ssytrs_aa.c - stbcon.c - stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c - stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c - stptrs.c - strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c - 
strtrs.c stzrzf.c sstemr.c - slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c - stfttr.c stpttf.c stpttr.c strttf.c strttp.c - sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c - sgeequb.c ssyequb.c spoequb.c sgbequb.c - sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c - sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c - sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c - stpqrt.c stpqrt2.c stpmqrt.c stprfb.c - sgelqt.c sgelqt3.c sgemlqt.c - sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c - sgelq.c slaswlq.c slamswlq.c sgemlq.c - stplqt.c stplqt2.c stpmlqt.c - ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c - ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c - ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c - sgesvdq.c slaorhr_col_getrfnp.c - slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c - slarmm.c slatrs3.c strsyl3.c) - -set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c - sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c - sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c - sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c - sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c - sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c - slascl2.c sla_wwaddw.c) - -set(CLASRC - cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c - cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c - cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c - cgehd2.c cgehrd.c cgelq2.c cgelqf.c - cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c - cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c - cgesc2.c cgesdd.c cgesvd.c cgesvdx.c - cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c - cgesvx.c cgetc2.c cgetrf2.c - cgetri.c - cggbak.c cggbal.c - cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c - cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c cggrqf.c - cggsvd3.c cggsvp3.c - cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c - chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c - checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c - chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c - chetf2.c chetrd.c - chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c - chetrs.c chetrs2.c - chetf2_rook.c chetrf_rook.c chetri_rook.c - chetrs_rook.c checon_rook.c chesv_rook.c - chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c - chetrs_3.c checon_3.c chesv_rk.c - chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c - chgeqz.c chpcon.c chpev.c chpevd.c - chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c - chpsvx.c - chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c - clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c - claed0.c claed7.c claed8.c - claein.c claesy.c claev2.c clags2.c clagtm.c - clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c - clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c - clanhb.c clanhe.c - clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c - clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c - claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c - claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c - claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c - clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c - clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c - clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c - clasyf.c clasyf_rook.c 
clasyf_rk.c clasyf_aa.c - clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c - cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c - cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c - cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c - cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c - cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c - crot.c cspcon.c csprfs.c cspsv.c - cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c - cstegr.c cstein.c csteqr.c csycon.c - csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c - csytri2.c csytri2x.c csyswapr.c - csytrs.c csytrs2.c - csyconv.c csyconvf.c csyconvf_rook.c - csytf2_rook.c csytrf_rook.c csytrs_rook.c - csytri_rook.c csycon_rook.c csysv_rook.c - csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c - csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c - ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c - ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c - ctprfs.c ctptri.c - ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c - ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c - cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c - cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c - cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c - cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c - chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c - ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c - cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c - cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c - cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c - cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c - ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c - cgelqt.c cgelqt3.c cgemlqt.c - cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c - cgelq.c claswlq.c clamswlq.c cgemlq.c - ctplqt.c ctplqt2.c ctpmlqt.c - chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c - cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c - chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c - cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c - cungtsqr.c cungtsqr_row.c cunhr_col.c - clatrs3.c ctrsyl3.c) - -set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c - cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c - csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c - cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c - cposvxx.c cporfsx.c cla_porfsx_extended.c - cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c - cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c - cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c - chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c - cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c - cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c) - -set(DLASRC - dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c - dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c - dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c - dgehd2.c dgehrd.c dgelq2.c dgelqf.c - dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c - dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c - dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c - dgetrf2.c dgetri.c - dggbak.c dggbal.c - dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c - dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c - dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c - dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c - dhsein.c dhseqr.c 
dlabrd.c dlacon.c dlacn2.c - dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c - dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c - dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c - dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c - dlapll.c dlapmt.c - dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c - dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c - dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c - dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c - dlargv.c dlarrv.c dlartv.c - dlarz.c dlarzb.c dlarzt.c dlasy2.c - dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c - dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c - dopgtr.c dopmtr.c dorg2l.c dorg2r.c - dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c - dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c - dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c - dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c - dpbstf.c dpbsv.c dpbsvx.c - dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c - dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c - dppcon.c dppequ.c - dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c - dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c - dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c - dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c - dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c - dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c - dsycon.c dsyev.c dsyevd.c dsyevr.c - dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c - dsysv.c dsysvx.c - dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c - dsytri2.c dsytri2x.c dsyswapr.c - dsyconv.c dsyconvf.c dsyconvf_rook.c - dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c - dsytri_rook.c dsycon_rook.c dsysv_rook.c - dsytf2_rk.c dsytrf_rk.c dsytrs_3.c - dsytri_3.c dsytri_3x.c dsycon_3.c dsysv_rk.c - dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c - dtbcon.c - dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c - dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c - dtptrs.c - dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c - dtrtrs.c dtzrzf.c dstemr.c - dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c - dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c - dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c - dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c - dgeequb.c dsyequb.c dpoequb.c dgbequb.c - dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c - dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c - dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c - dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c - dgelqt.c dgelqt3.c dgemlqt.c - dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c - dgelq.c dlaswlq.c dlamswlq.c dgemlq.c - dtplqt.c dtplqt2.c dtpmlqt.c - dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c - dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c - dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c - dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c - dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c - dlarmm.c dlatrs3.c dtrsyl3.c) - -set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c - dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c - dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c - dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c - dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c - dla_gbamv.c dla_gbrcond.c 
dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c - dlascl2.c dla_wwaddw.c) - -set(ZLASRC - zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c - zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c - zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c - zgehd2.c zgehrd.c zgelq2.c zgelqf.c - zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c - zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c - zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c - zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c - zgetc2.c zgetrf2.c - zgetri.c - zggbak.c zggbal.c - zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c - zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c - zggsvd3.c zggsvp3.c - zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c - zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c - zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c - zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c - zhetf2.c zhetrd.c - zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c - zhetrs.c zhetrs2.c - zhetf2_rook.c zhetrf_rook.c zhetri_rook.c - zhetrs_rook.c zhecon_rook.c zhesv_rook.c - zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c - zhetrs_3.c zhecon_3.c zhesv_rk.c - zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c - zhgeqz.c zhpcon.c zhpev.c zhpevd.c - zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c - zhpsvx.c - zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c - zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c - zlaed0.c zlaed7.c zlaed8.c - zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c - zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c - zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c - zlangt.c zlanhb.c - zlanhe.c - zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c - zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c - zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c - zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c - zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c - zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c - zlarfg.c zlarfgp.c zlarft.c - zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c - zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c - zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c - zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c - zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c - zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c - zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c - zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c - zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c - zrot.c zspcon.c zsprfs.c zspsv.c - zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c - zstegr.c zstein.c zsteqr.c zsycon.c - zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c - zsytri2.c zsytri2x.c zsyswapr.c - zsytrs.c zsytrs2.c - zsyconv.c zsyconvf.c zsyconvf_rook.c - zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c - zsytri_rook.c zsycon_rook.c zsysv_rook.c - zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c - zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c - ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c - ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c - ztprfs.c ztptri.c - ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c - ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c - zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c 
zungql.c zungqr.c zungr2.c - zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c - zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c - zunmtr.c zupgtr.c - zupmtr.c izmax1.c dzsum1.c zstemr.c - zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c - zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c - ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c - zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c - zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c - zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c - zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c - ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c - ztplqt.c ztplqt2.c ztpmlqt.c - zgelqt.c zgelqt3.c zgemlqt.c - zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c - zgelq.c zlaswlq.c zlamswlq.c zgemlq.c - zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c - zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c - zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c - zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) - -set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c - zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c - zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c - zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c - zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c - zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c - zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c - zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c - zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) - - -if(USE_XBLAS) - set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) -endif() - -list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c - DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c - DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) -list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c - DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c - DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) -list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c - DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c - DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) -list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c - DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c - DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) -message(STATUS "Building deprecated routines") - -set(DSLASRC spotrs.c) - -set(ZCLASRC cpotrs.c) - -set(SCATGEN slatm1.c slaran.c slarnd.c) - -set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c - slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c - slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c) - -set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c - clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c - clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c) - -set(DZATGEN dlatm1.c dlaran.c dlarnd.c) - -set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c - dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c - dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c) - -set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c - zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c - zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c) - -if(BUILD_SINGLE) - set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} 
${SCLAUX}) - set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision") -endif() -if(BUILD_DOUBLE) - set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) - set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) - message(STATUS "Building Double Precision") -endif() -if(BUILD_COMPLEX) - set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision Complex") -endif() -if(BUILD_COMPLEX16) - set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) -# for zlange/zlanhe - if (NOT BUILD_DOUBLE) - set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c) - endif () - message(STATUS "Building Double Precision Complex") -endif() - -endif() - -# add lapack-netlib folder to the sources -set(LA_SOURCES "") -foreach (LA_FILE ${LA_REL_SRC}) - list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") -endforeach () -foreach (LA_FILE ${LA_GEN_SRC}) - list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}") -endforeach () - -if (NOT C_LAPACK) - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") - if (${F_COMPILER} STREQUAL "GFORTRAN") - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") - endif() -else () - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") -endif () +# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. +if (NOT C_LAPACK) + message (STATUS "fortran lapack") +set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90 + ../INSTALL/ilaver.f xerbla_array.f + ../INSTALL/slamch.f) + +set(SCLAUX + scombssq.f sbdsvdx.f sstevx.f sstein.f + la_constants.f90 + sbdsdc.f + sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f + slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f + slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f + slagts.f slamrg.f slanst.f + slapy2.f slapy3.f slarnv.f + slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f + slarrk.f slarrr.f slaneg.f + slartg.f90 slaruv.f slas2.f slascl.f + slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f + slasd7.f slasd8.f slasda.f slasdq.f slasdt.f + slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f + slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f + ssteqr.f ssterf.f slaisnan.f sisnan.f + slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f + ../INSTALL/second_${TIMER}.f) + +set(DZLAUX + la_constants.f90 + dbdsdc.f + dbdsvdx.f dstevx.f dstein.f + dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f + dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f + dlagts.f dlamrg.f dlanst.f + dlapy2.f dlapy3.f dlarnv.f + dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f + dlarrk.f dlarrr.f dlaneg.f + dlartg.f90 dlaruv.f dlas2.f dlascl.f + dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f + dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f + dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f + dsteqr.f dsterf.f dlaisnan.f disnan.f + dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f + ../INSTALL/dlamch.f 
../INSTALL/dsecnd_${TIMER}.f) + +set(SLASRC + sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f + sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f + sgehd2.f sgehrd.f sgelq2.f sgelqf.f + sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f + sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f + sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f + sgetrf2.f sgetri.f + sggbak.f sggbal.f + sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f + sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f + sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f + sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f + shsein.f shseqr.f slabrd.f slacon.f slacn2.f + slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f + slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f + slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f + slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f + slansy.f slantb.f slantp.f slantr.f slanv2.f + slapll.f slapmt.f + slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f + slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f + slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarrv.f slartv.f + slarz.f slarzb.f slarzt.f slasy2.f + slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f + slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f + sopgtr.f sopmtr.f sorg2l.f sorg2r.f + sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f + sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f + sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f + sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f + spbstf.f spbsv.f spbsvx.f + spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f + sposvx.f spotrf2.f spotri.f spstrf.f spstf2.f + sppcon.f sppequ.f + spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f + spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f + ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f + ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f + sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f + ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f + ssyswapr.f ssytrs.f ssytrs2.f + ssyconv.f ssyconvf.f ssyconvf_rook.f + ssysv_aa.f ssysv_aa_2stage.f ssytrf_aa.f ssytrf_aa_2stage.f ssytrs_aa.f ssytrs_aa_2stage.f + ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f + ssytri_rook.f ssycon_rook.f ssysv_rook.f + ssytf2_rk.f ssytrf_rk.f ssytrs_3.f + ssytri_3.f ssytri_3x.f ssycon_3.f ssysv_rk.f + ssysv_aa.f ssytrf_aa.f ssytrs_aa.f + stbcon.f + stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f + stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f + stptrs.f + strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f + strtrs.f stzrzf.f sstemr.f + slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f + stfttr.f stpttf.f stpttr.f strttf.f strttp.f + sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f + sgeequb.f ssyequb.f spoequb.f sgbequb.f + sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f + sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f + sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f + stpqrt.f stpqrt2.f stpmqrt.f stprfb.f + sgelqt.f sgelqt3.f sgemlqt.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + 
sgelq.f slaswlq.f slamswlq.f sgemlq.f + stplqt.f stplqt2.f stpmlqt.f + ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f + ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f + ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f + sgesvdq.f slaorhr_col_getrfnp.f + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f + slarmm.f slatrs3.f strsyl3.f) + +set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f + sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f + sla_syrfsx_extended.f sla_syamv.f sla_syrcond.f sla_syrpvgrw.f + sposvxx.f sporfsx.f sla_porfsx_extended.f sla_porcond.f + sla_porpvgrw.f sgbsvxx.f sgbrfsx.f sla_gbrfsx_extended.f + sla_gbamv.f sla_gbrcond.f sla_gbrpvgrw.f sla_lin_berr.f slarscl2.f + slascl2.f sla_wwaddw.f) + +set(CLASRC + cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f + cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f + cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f + cgehd2.f cgehrd.f cgelq2.f cgelqf.f + cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f + cgesc2.f cgesdd.f cgesvd.f cgesvdx.f + cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f + cgesvx.f cgetc2.f cgetrf2.f + cgetri.f + cggbak.f cggbal.f + cgges.f cgges3.f cggesx.f cggev.f cggev3.f cggevx.f + cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f + cggsvd3.f cggsvp3.f + cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f + chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f + checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f + chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f + chetf2.f chetrd.f + chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f + chetrs.f chetrs2.f + chetf2_rook.f chetrf_rook.f chetri_rook.f + chetrs_rook.f checon_rook.f chesv_rook.f + chetf2_rk.f chetrf_rk.f chetri_3.f chetri_3x.f + chetrs_3.f checon_3.f chesv_rk.f + chesv_aa.f chesv_aa_2stage.f chetrf_aa.f chetrf_aa_2stage.f chetrs_aa.f chetrs_aa_2stage.f + chgeqz.f chpcon.f chpev.f chpevd.f + chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f + chpsvx.f + chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f + clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f + claed0.f claed7.f claed8.f + claein.f claesy.f claev2.f clags2.f clagtm.f + clahef.f clahef_rook.f clahef_rk.f clahef_aa.f clahqr.f + clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f + clanhb.f clanhe.f + clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f + clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f + claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f + claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f + claqz0.f claqz1.f claqz2.f claqz3.f + claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f + clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f + clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 + clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f + clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f + cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f + cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f + cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f + cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f + cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f + crot.f cspcon.f csprfs.f cspsv.f + cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f 
cstedc.f + cstegr.f cstein.f csteqr.f csycon.f + csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f + csytri2.f csytri2x.f csyswapr.f + csytrs.f csytrs2.f + csyconv.f csyconvf.f csyconvf_rook.f + csytf2_rook.f csytrf_rook.f csytrs_rook.f + csytri_rook.f csycon_rook.f csysv_rook.f + csytf2_rk.f csytrf_rk.f csytrf_aa.f csytrf_aa_2stage.f csytrs_3.f csytrs_aa.f csytrs_aa_2stage.f + csytri_3.f csytri_3x.f csycon_3.f csysv_rk.f csysv_aa.f csysv_aa_2stage.f + ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f + ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f + ctprfs.f ctptri.f + ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f + ctrsyl.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f + cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f + cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f + cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f + cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f + chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f + ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f + cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f + cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f + cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f + cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f + ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f + cgelqt.f cgelqt3.f cgemlqt.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgelq.f claswlq.f clamswlq.f cgemlq.f + ctplqt.f ctplqt2.f ctpmlqt.f + chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f + cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f + chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f + cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f + cungtsqr.f cungtsqr_row.f cunhr_col.f + clatrs3.f ctrsyl3.f ) + +set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f + cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f + csysvxx.f csyrfsx.f cla_syrfsx_extended.f cla_syamv.f + cla_syrcond_c.f cla_syrcond_x.f cla_syrpvgrw.f + cposvxx.f cporfsx.f cla_porfsx_extended.f + cla_porcond_c.f cla_porcond_x.f cla_porpvgrw.f + cgbsvxx.f cgbrfsx.f cla_gbrfsx_extended.f cla_gbamv.f + cla_gbrcond_c.f cla_gbrcond_x.f cla_gbrpvgrw.f + chesvxx.f cherfsx.f cla_herfsx_extended.f cla_heamv.f + cla_hercond_c.f cla_hercond_x.f cla_herpvgrw.f + cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) + +set(DLASRC + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f + dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f + dgehd2.f dgehrd.f dgelq2.f dgelqf.f + dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f + dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f + dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f + dgetrf2.f dgetri.f + dggbak.f dggbal.f + dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f + dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f + dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f + dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f + dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f + dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f + dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f + dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f + dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f + dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f + dlapll.f dlapmt.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f + dlaqtr.f dlar1v.f dlar2v.f iladlr.f 
iladlc.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlargv.f dlarrv.f dlartv.f + dlarz.f dlarzb.f dlarzt.f dlasy2.f + dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f + dopgtr.f dopmtr.f dorg2l.f dorg2r.f + dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f + dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f + dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f + dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f + dpbstf.f dpbsv.f dpbsvx.f + dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f + dposvx.f dpotrf2.f dpotri.f dpotrs.f dpstrf.f dpstf2.f + dppcon.f dppequ.f + dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f + dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f + dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f + dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f + dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f + dsycon.f dsyev.f dsyevd.f dsyevr.f + dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f + dsysv.f dsysvx.f + dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f + dsytri2.f dsytri2x.f dsyswapr.f + dsyconv.f dsyconvf.f dsyconvf_rook.f + dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f + dsytri_rook.f dsycon_rook.f dsysv_rook.f + dsytf2_rk.f dsytrf_rk.f dsytrs_3.f + dsytri_3.f dsytri_3x.f dsycon_3.f dsysv_rk.f + dsysv_aa.f dsysv_aa_2stage.f dsytrf_aa.f dsytrf_aa_2stage.f dsytrs_aa.f dsytrs_aa_2stage.f + dtbcon.f + dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f + dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f + dtptrs.f + dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f + dtrtrs.f dtzrzf.f dstemr.f + dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f + dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f + dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f + dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f + dgeequb.f dsyequb.f dpoequb.f dgbequb.f + dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f + dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f + dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f + dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f + dgelqt.f dgelqt3.f dgemlqt.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgelq.f dlaswlq.f dlamswlq.f dgemlq.f + dtplqt.f dtplqt2.f dtpmlqt.f + dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f + dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f + dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f + dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f + dlarmm.f dlatrs3.f dtrsyl3.f) + +set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f + dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f + dla_syrfsx_extended.f dla_syamv.f dla_syrcond.f dla_syrpvgrw.f + dposvxx.f dporfsx.f dla_porfsx_extended.f dla_porcond.f + dla_porpvgrw.f dgbsvxx.f dgbrfsx.f dla_gbrfsx_extended.f + dla_gbamv.f dla_gbrcond.f dla_gbrpvgrw.f dla_lin_berr.f dlarscl2.f + dlascl2.f dla_wwaddw.f) + +set(ZLASRC + zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f + zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f + zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f + zgehd2.f zgehrd.f zgelq2.f zgelqf.f + zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f + 
zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f + zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f + zgetc2.f zgetrf2.f + zgetri.f + zggbak.f zggbal.f + zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f + zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f + zggsvd3.f zggsvp3.f + zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f + zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f + zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f + zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f + zhetf2.f zhetrd.f + zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f + zhetrs.f zhetrs2.f + zhetf2_rook.f zhetrf_rook.f zhetri_rook.f + zhetrs_rook.f zhecon_rook.f zhesv_rook.f + zhetf2_rk.f zhetrf_rk.f zhetri_3.f zhetri_3x.f + zhetrs_3.f zhecon_3.f zhesv_rk.f + zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f + zhgeqz.f zhpcon.f zhpev.f zhpevd.f + zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f + zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f + zhpsvx.f + zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f + zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f + zlaed0.f zlaed7.f zlaed8.f + zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f + zlahef.f zlahef_rook.f zlahef_rk.f zlahef_aa.f zlahqr.f + zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f + zlangt.f zlanhb.f + zlanhe.f + zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f + zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f + zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f + zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f + zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f + zlarfg.f zlarfgp.f zlarft.f + zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f + zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f + zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f + zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f + zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f + zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f + zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f + zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f + zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f + zrot.f zspcon.f zsprfs.f zspsv.f + zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f + zstegr.f zstein.f zsteqr.f zsycon.f + zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f + zsytri2.f zsytri2x.f zsyswapr.f + zsytrs.f zsytrs2.f + zsyconv.f zsyconvf.f zsyconvf_rook.f + zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f zsytrs_aa.f zsytrs_aa_2stage.f + zsytri_rook.f zsycon_rook.f zsysv_rook.f + zsytf2_rk.f zsytrf_rk.f zsytrf_aa.f zsytrf_aa_2stage.f zsytrs_3.f + zsytri_3.f zsytri_3x.f zsycon_3.f zsysv_rk.f zsysv_aa.f zsysv_aa_2stage.f + ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f + ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f + ztprfs.f ztptri.f + ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f + ztrsyl.f ztrtrs.f ztzrzf.f zung2l.f + zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f + zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f + zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f + zunmtr.f zupgtr.f + zupmtr.f izmax1.f dzsum1.f zstemr.f + zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f + zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f + ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f + 
zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f + zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f + zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f + zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f + ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f + ztplqt.f ztplqt2.f ztpmlqt.f + zgelqt.f zgelqt3.f zgemlqt.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgelq.f zlaswlq.f zlamswlq.f zgemlq.f + zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f + zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f + zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f + zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f + zungtsqr.f zungtsqr_row.f zunhr_col.f + zlatrs3.f ztrsyl3.f) + +set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f + zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f + zla_syrfsx_extended.f zla_syamv.f zla_syrcond_c.f zla_syrcond_x.f + zla_syrpvgrw.f zposvxx.f zporfsx.f zla_porfsx_extended.f + zla_porcond_c.f zla_porcond_x.f zla_porpvgrw.f zgbsvxx.f zgbrfsx.f + zla_gbrfsx_extended.f zla_gbamv.f zla_gbrcond_c.f zla_gbrcond_x.f + zla_gbrpvgrw.f zhesvxx.f zherfsx.f zla_herfsx_extended.f + zla_heamv.f zla_hercond_c.f zla_hercond_x.f zla_herpvgrw.f + zla_lin_berr.f zlarscl2.f zlascl2.f zla_wwaddw.f) + + +if(USE_XBLAS) + set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) +endif() + +list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f + DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f + DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) +list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f + DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f + DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) +list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f + DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f + DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) +list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f + DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f + DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) +message(STATUS "Building deprecated routines") + +set(DSLASRC spotrs.f) + +set(ZCLASRC cpotrs.f) + +set(SCATGEN slatm1.f slaran.f slarnd.f) + +set(SMATGEN slatms.f slatme.f slatmr.f slatmt.f + slagge.f slagsy.f slakf2.f slarge.f slaror.f slarot.f slatm2.f + slatm3.f slatm5.f slatm6.f slatm7.f slahilb.f) + +set(CMATGEN clatms.f clatme.f clatmr.f clatmt.f + clagge.f claghe.f clagsy.f clakf2.f clarge.f claror.f clarot.f + clatm1.f clarnd.f clatm2.f clatm3.f clatm5.f clatm6.f clahilb.f slatm7.f) + +set(DZATGEN dlatm1.f dlaran.f dlarnd.f) + +set(DMATGEN dlatms.f dlatme.f dlatmr.f dlatmt.f + dlagge.f dlagsy.f dlakf2.f dlarge.f dlaror.f dlarot.f dlatm2.f + dlatm3.f dlatm5.f dlatm6.f dlatm7.f dlahilb.f) + +set(ZMATGEN zlatms.f zlatme.f zlatmr.f zlatmt.f + zlagge.f zlaghe.f zlagsy.f zlakf2.f zlarge.f zlaror.f zlarot.f + zlatm1.f zlarnd.f zlatm2.f zlatm3.f zlatm5.f zlatm6.f zlahilb.f dlatm7.f) + +if(BUILD_SINGLE) + set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) + set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision") +endif() +if(BUILD_DOUBLE) + set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) + set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) + message(STATUS "Building Double Precision") +endif() +if(BUILD_COMPLEX) + set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} 
${SCLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision Complex") +endif() +if(BUILD_COMPLEX16) + set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) + endif () + message(STATUS "Building Double Precision Complex") +endif() + +else () + + message (STATUS "c lapack") +set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c + ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c + ../INSTALL/ilaver.c xerbla_array.c + ../INSTALL/slamch.c) + +set(SCLAUX + scombssq.c sbdsvdx.c sstevx.c sstein.c + sbdsdc.c + sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c + slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c + slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c + slagts.c slamrg.c slanst.c + slapy2.c slapy3.c slarnv.c + slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c + slarrk.c slarrr.c slaneg.c + slartg.c slaruv.c slas2.c slascl.c + slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c + slasd7.c slasd8.c slasda.c slasdq.c slasdt.c + slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c + slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c + ssteqr.c ssterf.c slaisnan.c sisnan.c + slartgp.c slartgs.c + ../INSTALL/second_${TIMER}.c) + +set(DZLAUX + dbdsdc.c + dbdsvdx.c dstevx.c dstein.c + dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c + dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c + dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c + dlagts.c dlamrg.c dlanst.c + dlapy2.c dlapy3.c dlarnv.c + dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c + dlarrk.c dlarrr.c dlaneg.c + dlartg.c dlaruv.c dlas2.c dlascl.c + dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c + dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c + dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c + dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c + dsteqr.c dsterf.c dlaisnan.c disnan.c + dlartgp.c dlartgs.c + ../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c) + +set(SLASRC + sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c + sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c + sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c + sgehd2.c sgehrd.c sgelq2.c sgelqf.c + sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c + sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c + sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c + sgetrf2.c sgetri.c + sggbak.c sggbal.c + sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c + sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c + sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c + sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c + shsein.c shseqr.c slabrd.c slacon.c slacn2.c + slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c + slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c + slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c + slansy.c slantb.c slantp.c slantr.c slanv2.c + slapll.c slapmt.c + slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c + slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c + slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c + slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c + slarrv.c slartv.c + slarz.c slarzb.c slarzt.c slasy2.c + slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c + 
slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c + sopgtr.c sopmtr.c sorg2l.c sorg2r.c + sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c + sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c + sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c + sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c + spbstf.c spbsv.c spbsvx.c + spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c + sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c + sppcon.c sppequ.c + spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c + spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c + ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c + ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c + sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c + ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c + ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c + ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c + ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c + ssyswapr.c ssytrs.c ssytrs2.c + ssyconv.c ssyconvf.c ssyconvf_rook.c + ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c + ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c + ssytri_rook.c ssycon_rook.c ssysv_rook.c + ssytf2_rk.c ssytrf_rk.c ssytrs_3.c + ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c + ssysv_aa.c ssytrf_aa.c ssytrs_aa.c + stbcon.c + stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c + stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c + stptrs.c + strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c + strtrs.c stzrzf.c sstemr.c + slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c + stfttr.c stpttf.c stpttr.c strttf.c strttp.c + sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c + sgeequb.c ssyequb.c spoequb.c sgbequb.c + sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c + sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c + sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c + stpqrt.c stpqrt2.c stpmqrt.c stprfb.c + sgelqt.c sgelqt3.c sgemlqt.c + sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c + sgelq.c slaswlq.c slamswlq.c sgemlq.c + stplqt.c stplqt2.c stpmlqt.c + ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c + ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c + ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c + sgesvdq.c slaorhr_col_getrfnp.c + slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c + slarmm.c slatrs3.c strsyl3.c) + +set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c + sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c + sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c + sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c + sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c + sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c + slascl2.c sla_wwaddw.c) + +set(CLASRC + cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c + cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c + cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c + cgehd2.c cgehrd.c cgelq2.c cgelqf.c + cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c + cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c + cgesc2.c cgesdd.c cgesvd.c cgesvdx.c + cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c + cgesvx.c cgetc2.c cgetrf2.c + cgetri.c + cggbak.c cggbal.c + cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c + cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c 
cggrqf.c + cggsvd3.c cggsvp3.c + cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c + chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c + checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c + chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c + chetf2.c chetrd.c + chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c + chetrs.c chetrs2.c + chetf2_rook.c chetrf_rook.c chetri_rook.c + chetrs_rook.c checon_rook.c chesv_rook.c + chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c + chetrs_3.c checon_3.c chesv_rk.c + chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c + chgeqz.c chpcon.c chpev.c chpevd.c + chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c + chpsvx.c + chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c + clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c + claed0.c claed7.c claed8.c + claein.c claesy.c claev2.c clags2.c clagtm.c + clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c + clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c + clanhb.c clanhe.c + clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c + clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c + claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c + claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c + claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c + clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c + clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c + clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c + clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c + clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c + cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c + cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c + cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c + cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c + cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c + crot.c cspcon.c csprfs.c cspsv.c + cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c + cstegr.c cstein.c csteqr.c csycon.c + csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c + csytri2.c csytri2x.c csyswapr.c + csytrs.c csytrs2.c + csyconv.c csyconvf.c csyconvf_rook.c + csytf2_rook.c csytrf_rook.c csytrs_rook.c + csytri_rook.c csycon_rook.c csysv_rook.c + csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c + csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c + ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c + ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c + ctprfs.c ctptri.c + ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c + ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c + cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c + cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c + cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c + cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c + chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c + ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c + cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c + cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c + cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c + cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c + ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c + cgelqt.c cgelqt3.c cgemlqt.c + cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c + 
cgelq.c claswlq.c clamswlq.c cgemlq.c + ctplqt.c ctplqt2.c ctpmlqt.c + chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c + cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c + chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c + cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c + cungtsqr.c cungtsqr_row.c cunhr_col.c + clatrs3.c ctrsyl3.c) + +set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c + cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c + csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c + cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c + cposvxx.c cporfsx.c cla_porfsx_extended.c + cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c + cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c + cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c + chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c + cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c + cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c) + +set(DLASRC + dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c + dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c + dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c + dgehd2.c dgehrd.c dgelq2.c dgelqf.c + dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c + dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c + dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c + dgetrf2.c dgetri.c + dggbak.c dggbal.c + dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c + dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c + dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c + dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c + dhsein.c dhseqr.c dlabrd.c dlacon.c dlacn2.c + dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c + dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c + dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c + dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c + dlapll.c dlapmt.c + dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c + dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c + dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c + dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c + dlargv.c dlarrv.c dlartv.c + dlarz.c dlarzb.c dlarzt.c dlasy2.c + dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c + dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c + dopgtr.c dopmtr.c dorg2l.c dorg2r.c + dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c + dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c + dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c + dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c + dpbstf.c dpbsv.c dpbsvx.c + dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c + dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c + dppcon.c dppequ.c + dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c + dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c + dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c + dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c + dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c + dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c + dsycon.c dsyev.c dsyevd.c dsyevr.c + dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c + dsysv.c dsysvx.c + dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c + dsytri2.c dsytri2x.c dsyswapr.c + dsyconv.c dsyconvf.c dsyconvf_rook.c + dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c + dsytri_rook.c dsycon_rook.c dsysv_rook.c + dsytf2_rk.c dsytrf_rk.c dsytrs_3.c + dsytri_3.c 
dsytri_3x.c dsycon_3.c dsysv_rk.c + dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c + dtbcon.c + dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c + dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c + dtptrs.c + dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c + dtrtrs.c dtzrzf.c dstemr.c + dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c + dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c + dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c + dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c + dgeequb.c dsyequb.c dpoequb.c dgbequb.c + dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c + dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c + dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c + dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c + dgelqt.c dgelqt3.c dgemlqt.c + dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c + dgelq.c dlaswlq.c dlamswlq.c dgemlq.c + dtplqt.c dtplqt2.c dtpmlqt.c + dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c + dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c + dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c + dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c + dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c + dlarmm.c dlatrs3.c dtrsyl3.c) + +set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c + dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c + dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c + dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c + dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c + dla_gbamv.c dla_gbrcond.c dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c + dlascl2.c dla_wwaddw.c) + +set(ZLASRC + zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c + zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c + zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c + zgehd2.c zgehrd.c zgelq2.c zgelqf.c + zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c + zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c + zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c + zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c + zgetc2.c zgetrf2.c + zgetri.c + zggbak.c zggbal.c + zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c + zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c + zggsvd3.c zggsvp3.c + zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c + zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c + zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c + zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c + zhetf2.c zhetrd.c + zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c + zhetrs.c zhetrs2.c + zhetf2_rook.c zhetrf_rook.c zhetri_rook.c + zhetrs_rook.c zhecon_rook.c zhesv_rook.c + zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c + zhetrs_3.c zhecon_3.c zhesv_rk.c + zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c + zhgeqz.c zhpcon.c zhpev.c zhpevd.c + zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c + zhpsvx.c + zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c + zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c + zlaed0.c zlaed7.c zlaed8.c + zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c + zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c + zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c + zlangt.c zlanhb.c + zlanhe.c + zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c + zlantp.c zlantr.c zlapll.c zlapmt.c 
zlaqgb.c zlaqge.c + zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c + zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c + zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c + zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c + zlarfg.c zlarfgp.c zlarft.c + zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c + zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c + zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c + zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c + zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c + zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c + zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c + zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c + zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c + zrot.c zspcon.c zsprfs.c zspsv.c + zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c + zstegr.c zstein.c zsteqr.c zsycon.c + zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c + zsytri2.c zsytri2x.c zsyswapr.c + zsytrs.c zsytrs2.c + zsyconv.c zsyconvf.c zsyconvf_rook.c + zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c + zsytri_rook.c zsycon_rook.c zsysv_rook.c + zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c + zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c + ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c + ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c + ztprfs.c ztptri.c + ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c + ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c + zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c zungql.c zungqr.c zungr2.c + zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c + zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c + zunmtr.c zupgtr.c + zupmtr.c izmax1.c dzsum1.c zstemr.c + zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c + zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c + ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c + zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c + zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c + zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c + zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c + ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c + ztplqt.c ztplqt2.c ztpmlqt.c + zgelqt.c zgelqt3.c zgemlqt.c + zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c + zgelq.c zlaswlq.c zlamswlq.c zgemlq.c + zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c + zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c + zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c + zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) + +set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c + zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c + zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c + zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c + zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c + zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c + zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c + zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c + zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) + + +if(USE_XBLAS) + set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) +endif() + +list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c + DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c + 
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) +list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c + DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c + DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) +list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c + DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c + DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) +list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c + DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c + DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) +message(STATUS "Building deprecated routines") + +set(DSLASRC spotrs.c) + +set(ZCLASRC cpotrs.c) + +set(SCATGEN slatm1.c slaran.c slarnd.c) + +set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c + slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c + slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c) + +set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c + clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c + clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c) + +set(DZATGEN dlatm1.c dlaran.c dlarnd.c) + +set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c + dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c + dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c) + +set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c + zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c + zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c) + +if(BUILD_SINGLE) + set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) + set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision") +endif() +if(BUILD_DOUBLE) + set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) + set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) + message(STATUS "Building Double Precision") +endif() +if(BUILD_COMPLEX) + set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision Complex") +endif() +if(BUILD_COMPLEX16) + set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c) + endif () + message(STATUS "Building Double Precision Complex") +endif() + +endif() + +# add lapack-netlib folder to the sources +set(LA_SOURCES "") +foreach (LA_FILE ${LA_REL_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") +endforeach () +foreach (LA_FILE ${LA_GEN_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}") +endforeach () + +if (NOT C_LAPACK) + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + if (${F_COMPILER} STREQUAL "GFORTRAN") + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") + endif() +else () + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +endif () From 1688c7da439c377c0e7c8491c711655f1ff1c2ef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Nov 2022 22:24:01 +0100 Subject: [PATCH 100/154] change line endings from CRLF to LF --- kernel/arm64/sgemm_ncopy_4.S | 666 +- kernel/arm64/sgemm_tcopy_16.S | 1628 +- kernel/power/cgemm_kernel_power9.S | 586 +- kernel/power/cgemm_logic_power9.S | 5632 +++---- 
kernel/power/cgemm_macros_power9.S | 6036 +++---- kernel/power/cgemv_n.c | 1194 +- kernel/power/cgemv_t.c | 1202 +- kernel/power/crot.c | 466 +- kernel/power/dgemm_kernel_power9.S | 498 +- kernel/power/dgemm_logic_power9.S | 3962 ++--- kernel/power/dgemm_macros_power9.S | 7244 ++++---- kernel/power/icamax.c | 656 +- kernel/power/icamin.c | 532 +- kernel/power/isamax.c | 576 +- kernel/power/isamin.c | 576 +- kernel/power/sgemm_kernel_power9.S | 544 +- kernel/power/sgemm_logic_power9.S | 4382 ++--- kernel/power/sgemm_macros_power9.S | 11148 ++++++------- kernel/power/sgemv_n.c | 940 +- kernel/power/sgemv_n_8.c | 1028 +- kernel/power/sgemv_t.c | 968 +- kernel/power/sgemv_t_8.c | 1016 +- kernel/power/zgemm_kernel_power9.S | 488 +- kernel/power/zgemm_logic_power9.S | 3780 ++--- kernel/power/zgemm_macros_power9.S | 3648 ++--- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 13612 ++++++++-------- .../strsm_kernel_8x4_haswell_R_common.h | 452 +- kernel/x86_64/zgemm_kernel_2x2_bulldozer.S | 2808 ++-- kernel/x86_64/zgemm_kernel_2x2_piledriver.S | 2858 ++-- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 7762 ++++----- relapack/src/CMakeLists.txt | 172 +- 31 files changed, 43530 insertions(+), 43530 deletions(-) diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index 30450cc7d..c819ee6fb 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -1,333 +1,333 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A00 x2 -#define LDA x3 -#define B00 x4 - -#define A01 x5 -#define A02 x6 -#define A03 x7 -#define A04 x8 - -#define I x9 -#define J x10 - -#define TEMP1 x11 -#define TEMP2 x12 - -#define A_PREFETCH 2560 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -.macro COPY4x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] - add B00, B00, #64 - -.endm - -.macro COPY1x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - ldr s2, [A03], #4 - ldr s3, [A04], #4 - - stp s0, s1, [B00] - add B00, B00, #8 - stp s2, s3, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] - add B00, B00, #32 -.endm - - -.macro COPY1x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - - stp s0, s1, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr q0, [A01], #16 - str q0, [B00], #16 -.endm - - -.macro COPY1x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01], #4 - str s0, [B00], #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - -.Ldgemm_ncopy_L4_BEGIN: - - asr J, N, #2 // J = N / 4 - cmp 
J, #0 - ble .Ldgemm_ncopy_L2_BEGIN - - .align 5 -.Ldgemm_ncopy_L4_M4_BEGIN: - - mov A01, A00 - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A00, A04, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_40 - - .align 5 -.Ldgemm_ncopy_L4_M4_20: - - COPY4x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_20 - -.Ldgemm_ncopy_L4_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_END - - .align 5 -.Ldgemm_ncopy_L4_M4_60: - - COPY1x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_60 - -.Ldgemm_ncopy_L4_M4_END: - - subs J , J, #1 // j-- - bne .Ldgemm_ncopy_L4_M4_BEGIN - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L2_BEGIN: - - tst N, #3 - ble .Ldgemm_ncopy_L999 - - tst N, #2 - ble .Ldgemm_ncopy_L1_BEGIN - -.Ldgemm_ncopy_L2_M4_BEGIN: - mov A01, A00 - add A02, A01, LDA - add A00, A02, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_40 - - .align 5 -.Ldgemm_ncopy_L2_M4_20: - - COPY4x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_20 - -.Ldgemm_ncopy_L2_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_END - - .align 5 -.Ldgemm_ncopy_L2_M4_60: - - COPY1x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_60 - -.Ldgemm_ncopy_L2_M4_END: - - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L1_BEGIN: - - tst N, #1 - ble .Ldgemm_ncopy_L999 - -.Ldgemm_ncopy_L1_M4_BEGIN: - - mov A01, A00 - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_40 - - .align 5 -.Ldgemm_ncopy_L1_M4_20: - - COPY4x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_20 - - -.Ldgemm_ncopy_L1_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_END - - .align 5 -.Ldgemm_ncopy_L1_M4_60: - - COPY1x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_60 - - -.Ldgemm_ncopy_L1_M4_END: - -.Ldgemm_ncopy_L999: - - mov x0, #0 - RESTORE_REGS - ret - - EPILOGUE - +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 + +#define I x9 +#define J x10 + +#define TEMP1 x11 +#define TEMP2 x12 + +#define A_PREFETCH 2560 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + + ldr q3, [A04], #16 + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] + add B00, B00, #64 + +.endm + +.macro COPY1x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ldr s2, [A03], #4 + ldr s3, [A04], #4 + + stp s0, s1, [B00] + add B00, B00, #8 + stp s2, s3, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] + add B00, B00, #32 +.endm + + +.macro COPY1x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + + stp s0, s1, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x1 + prfm PLDL1KEEP, [A01, 
#A_PREFETCH] + + ldr q0, [A01], #16 + str q0, [B00], #16 +.endm + + +.macro COPY1x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Ldgemm_ncopy_L4_BEGIN: + + asr J, N, #2 // J = N / 4 + cmp J, #0 + ble .Ldgemm_ncopy_L2_BEGIN + + .align 5 +.Ldgemm_ncopy_L4_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_40 + + .align 5 +.Ldgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_20 + +.Ldgemm_ncopy_L4_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_END + + .align 5 +.Ldgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_60 + +.Ldgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne .Ldgemm_ncopy_L4_M4_BEGIN + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble .Ldgemm_ncopy_L999 + + tst N, #2 + ble .Ldgemm_ncopy_L1_BEGIN + +.Ldgemm_ncopy_L2_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_40 + + .align 5 +.Ldgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_20 + +.Ldgemm_ncopy_L2_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_END + + .align 5 +.Ldgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_60 + +.Ldgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Ldgemm_ncopy_L999 + +.Ldgemm_ncopy_L1_M4_BEGIN: + + mov A01, A00 + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_40 + + .align 5 +.Ldgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_20 + + +.Ldgemm_ncopy_L1_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_END + + .align 5 +.Ldgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_60 + + +.Ldgemm_ncopy_L1_M4_END: + +.Ldgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 431f1ae2a..3066421bb 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -1,814 +1,814 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A x2 -#define LDA x3 -#define B x4 - -#define M8 x5 - -#define A01 x6 -#define A02 x7 -#define A03 x8 -#define A04 x9 -#define A05 x10 -#define A06 x11 -#define A07 x12 -#define A08 x13 - -#define B01 x14 -#define B02 x15 -#define B03 x16 -#define B04 x17 -#define B00 x22 - - -#define I x21 -#define J x19 - -#define TEMP1 x20 - -#define A_PREFETCH 256 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - //prfm PSTL1KEEP, [B00, M8] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] - add A05, A05, #64 - - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] - add A06, A06, #64 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] - add A07, A07, #64 - - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] - add A08, A08, #64 - - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - add B00, B00, M8 - -.endm - -.macro COPY8x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, 
v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 - - ldp q8, q9, [A05] - ldp q10, q11, [A06] - add A05, A05, #32 - add A06, A06, #32 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] - add B01, B01, #64 - - ldp q12, q13, [A07] - ldp q14, q15, [A08] - add A07, A07, #32 - add A08, A08, #32 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - add B02, B02, #64 - - ldr q4, [A05] - ldr q5, [A06] - ldr q6, [A07] - ldr q7, [A08] - - add A05, A05, #16 - add A06, A06, #16 - add A07, A07, #16 - add A08, A08, #16 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] - add B02, B02, #64 -.endm - -.macro COPY2x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - add B03, B03, #16 - - ldr d4, [A05] - ldr d5, [A06] - ldr d6, [A07] - ldr d7, [A08] - - add A05, A05, #8 - add A06, A06, #8 - add A07, A07, #8 - add A08, A08, #8 - - stp d4, d5, [B03] - add B03, B03, #16 - stp d6, d7, [B03] - add B03, B03, #16 - -.endm - -.macro COPY1x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - - ldr s4, [A05] - ldr s5, [A06] - ldr s6, [A07] - ldr s7, [A08] - - stp s4, s5, [B04] - add B04, B04, #8 - stp s6, s7, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ -.macro COPY16x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - - add B00, B00, M8 -.endm - -.macro COPY8x4 - prfm PLDL1KEEP, [A01, 
#A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - - add B02, B02, #64 -.endm - -.macro COPY2x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - - add B03, B03, #16 -.endm - -.macro COPY1x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add B00, B00, M8 -.endm - -.macro COPY8x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s}, [A01] - ld1 {v2.4s, v3.4s}, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - add A01, A01, #16 - add A02, A02, #16 - - stp q0, q1, [B02] - add B02, B02, #32 -.endm - -.macro COPY2x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - - add A01, A01, #8 - add A02, A02, #8 - - stp d0, d1, [B03] - add B03, B03, #16 -.endm - -.macro COPY1x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - - add A01, A01, #4 - add A02, A02, #4 - - stp s0, s1, [B04] - - add B04, B04, #8 -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add B00, B00, M8 -.endm - -.macro COPY8x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldp q0, q1, [A01] - add A01, A01, #32 - stp q0, q1, [B01] - - add B01, B01, #32 -.endm - -.macro COPY4x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] 
- - ldr q0, [A01] - add A01, A01, #16 - str q0, [B02] - - add B02, B02, #16 -.endm - -.macro COPY2x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr d0, [A01] - add A01, A01, #8 - str d0, [B03] - - add B03, B03, #8 -.endm - -.macro COPY1x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01] - add A01, A01, #4 - str s0, [B04] - - add B04, B04, #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - - lsl TEMP1, M, #2 // TEMP1 = M * SIZE - - and B01 , N , #-16 - and B02 , N , #-8 - and B03 , N , #-4 - and B04 , N , #-2 - - mul B01, B01, TEMP1 - mul B02, B02, TEMP1 - mul B03, B03, TEMP1 - mul B04, B04, TEMP1 - - add B01 , B01, B - add B02 , B02, B - add B03 , B03, B - add B04 , B04, B - - lsl M8, M, #6 // M8 = M * 16 * SIZE - -.Lsgemm_tcopy_L8_BEGIN: - asr J, M, #3 // J = M / 8 - cmp J, #0 - ble .Lsgemm_tcopy_L4_BEGIN - - .align 5 -.Lsgemm_tcopy_L8_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A05, A04, LDA - add A06, A05, LDA - add A07, A06, LDA - add A08, A07, LDA - add A, A08, LDA - - mov B00, B - add B, B00, #512 // B = B + 8 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L8_M16_40 - - .align 5 -.Lsgemm_tcopy_L8_M16_20: - - COPY16x8 - - subs I , I , #1 - bne .Lsgemm_tcopy_L8_M16_20 - -.Lsgemm_tcopy_L8_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L8_M16_60 - - COPY8x8 - -.Lsgemm_tcopy_L8_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L8_M16_80 - - COPY4x8 - -.Lsgemm_tcopy_L8_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L8_M16_100 - - COPY2x8 - -.Lsgemm_tcopy_L8_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L8_M16_END - - COPY1x8 - -.Lsgemm_tcopy_L8_M16_END: - - subs J , J, #1 // j-- - bne .Lsgemm_tcopy_L8_M16_BEGIN - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L4_BEGIN: - tst M, #7 - ble .Lsgemm_tcopy_L999 - - tst M, #4 - ble .Lsgemm_tcopy_L2_BEGIN - -.Lsgemm_tcopy_L4_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A, A04, LDA - - mov B00, B - add B, B00, #256 // B = B + 4 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L4_M16_40 - - .align 5 -.Lsgemm_tcopy_L4_M16_20: - - COPY16x4 - - subs I , I , #1 - bne .Lsgemm_tcopy_L4_M16_20 - -.Lsgemm_tcopy_L4_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L4_M16_60 - - COPY8x4 - -.Lsgemm_tcopy_L4_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L4_M16_80 - - COPY4x4 - -.Lsgemm_tcopy_L4_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L4_M16_100 - - COPY2x4 - - -.Lsgemm_tcopy_L4_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L4_M16_END - - COPY1x4 - - -.Lsgemm_tcopy_L4_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L2_BEGIN: - - tst M, #3 - ble .Lsgemm_tcopy_L999 - - tst M, #2 - ble .Lsgemm_tcopy_L1_BEGIN - -.Lsgemm_tcopy_L2_M16_BEGIN: - mov A01, A - add A02, A01, LDA - add A, A02, LDA - - mov B00, B - add B, B00, #128 // B = B + 2 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L2_M16_40 - - .align 5 -.Lsgemm_tcopy_L2_M16_20: - - COPY16x2 - - subs I , I , #1 - bne .Lsgemm_tcopy_L2_M16_20 - -.Lsgemm_tcopy_L2_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L2_M16_60 - - COPY8x2 - -.Lsgemm_tcopy_L2_M16_60: - tst N , #4 - ble 
.Lsgemm_tcopy_L2_M16_80 - - COPY4x2 - -.Lsgemm_tcopy_L2_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L2_M16_100 - - COPY2x2 - -.Lsgemm_tcopy_L2_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L2_M16_END - - COPY1x2 - -.Lsgemm_tcopy_L2_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L1_BEGIN: - - tst M, #1 - ble .Lsgemm_tcopy_L999 - - -.Lsgemm_tcopy_L1_M16_BEGIN: - - mov A01, A // A01 = A - mov B00, B - - asr I, N, #4 // I = M / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L1_M16_40 - - .align 5 -.Lsgemm_tcopy_L1_M16_20: - - COPY16x1 - - subs I , I , #1 - bne .Lsgemm_tcopy_L1_M16_20 - -.Lsgemm_tcopy_L1_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L1_M16_60 - - COPY8x1 - -.Lsgemm_tcopy_L1_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L1_M16_80 - - COPY4x1 - -.Lsgemm_tcopy_L1_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L1_M16_100 - - COPY2x1 - -.Lsgemm_tcopy_L1_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L1_M16_END - - COPY1x1 - - -.Lsgemm_tcopy_L1_M16_END: - -.Lsgemm_tcopy_L999: - mov x0, #0 // set return value - RESTORE_REGS - ret - - EPILOGUE - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x21 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + //prfm PSTL1KEEP, [B00, M8] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + 
add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] + add A05, A05, #64 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] + add A06, A06, #64 + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] + add A07, A07, #64 + + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] + add A08, A08, #64 + + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 + +.endm + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] + add B01, B01, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + add B02, B02, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] + add B02, B02, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + add B03, B03, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B03] + add B03, B03, #16 + stp d6, d7, [B03] + add B03, B03, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + 
//prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + stp s4, s5, [B04] + add B04, B04, #8 + stp s6, s7, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ +.macro COPY16x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + + add B00, B00, M8 +.endm + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + + add B02, B02, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + + add B03, B03, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add B00, B00, M8 +.endm + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add 
A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B02] + add B02, B02, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B03] + add B03, B03, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B04] + + add B04, B04, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B01] + + add B01, B01, #32 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B02] + + add B02, B02, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B03] + + add B03, B03, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B04] + + add B04, B04, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-16 + and B02 , N , #-8 + and B03 , N , #-4 + and B04 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + mul B04, B04, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + add B04 , B04, B + + lsl M8, M, #6 // M8 = M * 16 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #512 // B = B + 8 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M16_40 + + .align 5 +.Lsgemm_tcopy_L8_M16_20: + + COPY16x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M16_20 + +.Lsgemm_tcopy_L8_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L8_M16_60 + + COPY8x8 + +.Lsgemm_tcopy_L8_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L8_M16_80 + + COPY4x8 + +.Lsgemm_tcopy_L8_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M16_100 + + COPY2x8 + +.Lsgemm_tcopy_L8_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M16_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M16_END: + + subs J , J, #1 // j-- + bne .Lsgemm_tcopy_L8_M16_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov 
B00, B + add B, B00, #256 // B = B + 4 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M16_40 + + .align 5 +.Lsgemm_tcopy_L4_M16_20: + + COPY16x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M16_20 + +.Lsgemm_tcopy_L4_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L4_M16_60 + + COPY8x4 + +.Lsgemm_tcopy_L4_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L4_M16_80 + + COPY4x4 + +.Lsgemm_tcopy_L4_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M16_100 + + COPY2x4 + + +.Lsgemm_tcopy_L4_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L4_M16_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #128 // B = B + 2 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M16_40 + + .align 5 +.Lsgemm_tcopy_L2_M16_20: + + COPY16x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M16_20 + +.Lsgemm_tcopy_L2_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L2_M16_60 + + COPY8x2 + +.Lsgemm_tcopy_L2_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L2_M16_80 + + COPY4x2 + +.Lsgemm_tcopy_L2_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M16_100 + + COPY2x2 + +.Lsgemm_tcopy_L2_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M16_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #4 // I = M / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M16_40 + + .align 5 +.Lsgemm_tcopy_L1_M16_20: + + COPY16x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M16_20 + +.Lsgemm_tcopy_L1_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L1_M16_60 + + COPY8x1 + +.Lsgemm_tcopy_L1_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L1_M16_80 + + COPY4x1 + +.Lsgemm_tcopy_L1_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M16_100 + + COPY2x1 + +.Lsgemm_tcopy_L1_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M16_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M16_END: + +.Lsgemm_tcopy_L999: + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE + + diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S index 4b5c2fa31..dfe17f3ef 100644 --- a/kernel/power/cgemm_kernel_power9.S +++ b/kernel/power/cgemm_kernel_power9.S @@ -1,293 +1,293 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - -#define alpha_r vs19 -#define alpha_i vs20 -#define save_permute_1 vs21 -#define permute_mask vs22 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define PRE r29 - -#define T12 r30 -#define T13 r31 - -#include "cgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_12, 0x0c0d0e0f1c1d1e1f -.equ save_permute_11, 0x0405060714151617 - - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - - - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) - - - -#ifdef TRMMKERNEL - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) -#endif - slwi LDC, LDC, ZBASE_SHIFT - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xscvdpspn alpha_i,vs2 - xxspltw alpha_r,alpha_r,0 - xxspltw alpha_i,alpha_i,0 -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - - - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - - - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - - - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - - - li r0,0 - li PRE,512 - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegsp alpha_r,alpha_r - xvnegsp alpha_i,alpha_i -#endif - - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - - /*mask is reverse permute so we have to make it inner permute */ - xxpermdi permute_mask, permute_mask, permute_mask,2 - -#include "cgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S index b4f937e90..a191219fa 100644 --- a/kernel/power/cgemm_logic_power9.S +++ b/kernel/power/cgemm_logic_power9.S @@ -1,2816 +1,2816 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define MY_ALIGN .align 3 -b CGEMM_L4 -/* MINI SUBROUTINES */ -/* 4x8 MAIN 128x+2 LOOP */ - - -CGEMM_L4x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x8_2 - MY_ALIGN -CGEMM_L4x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 -CGEMM_L4x8_K128: -/*----------------------------------------*/ - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_L2 128,64,31,0 - KERNEL4x8_L2 128,64,32,0 - KERNEL4x8_L2 128,64,33,0 - KERNEL4x8_L2 128,64,34,0 - KERNEL4x8_L2 128,64,35,0 - KERNEL4x8_L2 128,64,36,0 - KERNEL4x8_L2 128,64,37,0 - KERNEL4x8_L2 128,64,38,0 - KERNEL4x8_L2 128,64,39,0 - KERNEL4x8_L2 128,64,40,0 - KERNEL4x8_L2 128,64,41,0 - KERNEL4x8_L2 128,64,42,0 - KERNEL4x8_L2 128,64,43,0 - KERNEL4x8_L2 128,64,44,0 - KERNEL4x8_L2 128,64,45,0 - KERNEL4x8_L2 128,64,46,0 - KERNEL4x8_L2 128,64,47,0 - KERNEL4x8_L2 128,64,48,0 - KERNEL4x8_L2 128,64,49,0 - KERNEL4x8_L2 128,64,50,0 - KERNEL4x8_L2 128,64,51,0 - KERNEL4x8_L2 128,64,52,0 - KERNEL4x8_L2 128,64,53,0 - KERNEL4x8_L2 128,64,54,0 - KERNEL4x8_L2 128,64,55,0 - KERNEL4x8_L2 128,64,56,0 - KERNEL4x8_L2 128,64,57,0 - KERNEL4x8_L2 128,64,58,0 - KERNEL4x8_L2 128,64,59,0 - KERNEL4x8_L2 128,64,60,0 - KERNEL4x8_L2 128,64,61,0 - KERNEL4x8_L2 128,64,62,0 - KERNEL4x8_L2 128,64,63,1 - bdnz CGEMM_L4x8_LOOP - MY_ALIGN -CGEMM_L4x8_LOOP_END: -/*----------------------------------------*/ - END4x8_2 - blr - MY_ALIGN - - -CGEMM_4x8_L64_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - 
KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_E2 128,64,31,1 - blr - MY_ALIGN - - -CGEMM_4x8_L32_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_E2 128,64,15,1 - blr - MY_ALIGN - - -CGEMM_4x8_L16_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_E2 128,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x4_2 - MY_ALIGN -CGEMM_L4x4_LOOP: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,0,0 -CGEMM_L4x4_K32: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_L2 64,64,7,0 - KERNEL4x4_L2 64,64,8,0 - KERNEL4x4_L2 64,64,9,0 - KERNEL4x4_L2 64,64,10,0 - KERNEL4x4_L2 64,64,11,0 - KERNEL4x4_L2 64,64,12,0 - KERNEL4x4_L2 64,64,13,0 - KERNEL4x4_L2 64,64,14,0 - KERNEL4x4_L2 64,64,15,1 - bdnz CGEMM_L4x4_LOOP - MY_ALIGN -CGEMM_L4x4_LOOP_END: -/*----------------------------------------*/ - END4x4_2 - blr - MY_ALIGN - - -CGEMM_4x4_L16_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_E2 64,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_L8_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_E2 64,64,3,1 - blr - - -CGEMM_4x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x2_2 - MY_ALIGN -CGEMM_L4x2_LOOP: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,0,0 -CGEMM_L4x2_K32: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,1,0 - 
KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_L2 32,64,7,0 - KERNEL4x2_L2 32,64,8,0 - KERNEL4x2_L2 32,64,9,0 - KERNEL4x2_L2 32,64,10,0 - KERNEL4x2_L2 32,64,11,0 - KERNEL4x2_L2 32,64,12,0 - KERNEL4x2_L2 32,64,13,0 - KERNEL4x2_L2 32,64,14,0 - KERNEL4x2_L2 32,64,15,1 - bdnz CGEMM_L4x2_LOOP - MY_ALIGN - - -CGEMM_L4x2_LOOP_END: -/*----------------------------------------*/ - END4x2_2 - blr - MY_ALIGN -CGEMM_4x2_L16_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_E2 32,64,7,1 - blr - MY_ALIGN -CGEMM_4x2_L8_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_E2 32,64,3,1 - blr - - -CGEMM_4x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x1_2 - MY_ALIGN -CGEMM_L4x1_LOOP: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,0,0 -CGEMM_L4x1_K32: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_L2 16,64,7,0 - KERNEL4x1_L2 16,64,8,0 - KERNEL4x1_L2 16,64,9,0 - KERNEL4x1_L2 16,64,10,0 - KERNEL4x1_L2 16,64,11,0 - KERNEL4x1_L2 16,64,12,0 - KERNEL4x1_L2 16,64,13,0 - KERNEL4x1_L2 16,64,14,0 - KERNEL4x1_L2 16,64,15,1 - bdnz CGEMM_L4x1_LOOP - MY_ALIGN -CGEMM_L4x1_LOOP_END: -/*----------------------------------------*/ - END4x1_2 - blr - - MY_ALIGN -CGEMM_4x1_L16_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_E2 16,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x1_L8_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_E2 16,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L4: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 2 - ble CGEMM_L4_END - - -CGEMM_L4_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 2 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L4x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L4x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO4x8 - ble CGEMM_L4x8_SUB0 - bl CGEMM_L4x8_LMAIN_SUB - andi. 
L, T1, 127 - ble CGEMM_L4x8_SAVE - b CGEMM_L4x8_SUB2 - - -CGEMM_L4x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP4x8_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD4x8O 64,32 - END4x8_WITHOUT_ADD - LOAD4x8_2O 128, 64 - mtctr T8 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - CMP4x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L4x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD4x8_2O 128,64 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - MY_ALIGN - - -CGEMM_L4x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L4x8_SUB2_32 - bl CGEMM_4x8_L64_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L4x8_SUB2_16 - bl CGEMM_4x8_L32_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x8_SUB2_8 - bl CGEMM_4x8_L16_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x8_SUB2_4 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_L2 128,64, 1,0 - KERNEL4x8_L2 128,64, 2,0 - KERNEL4x8_E2 128,64, 3,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x8_SUB2_2 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_E2 128,64, 1,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x8_SUB2_1 - LOAD4x8_2 - KERNEL4x8_E2 128,64, 0,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x8_SAVE - KERNEL4x8 - - MY_ALIGN -CGEMM_L4x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 -#endif - bgt CGEMM_L4x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END - b CGEMM_L4x4_BEGIN - MY_ALIGN - - -CGEMM_L4x8_END: -/*----------------------------------------*/ - - -CGEMM_L4x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x4 - ble CGEMM_L4x4_SUB0 - bl CGEMM_4x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x4_SAVE - b CGEMM_L4x4_SUB2 - - -CGEMM_L4x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x4_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD4x4O 32,32 - END4x4_WITHOUT_ADD - LOAD4x4_2O 64, 64 - mtctr T8 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - CMP4x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD4x4_2O 64,64 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x4_SUB2: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble CGEMM_L4x4_SUB2_8 - bl CGEMM_4x4_L16_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x4_SUB2_4 - bl CGEMM_4x4_L8_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x4_SUB2_2 - LOAD4x4_2 - KERNEL4x4_L2 64,64, 0,0 - KERNEL4x4_E2 64,64, 1,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x4_SUB2_1 - LOAD4x4_2 - KERNEL4x4_E2 64,64, 0,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x4_SAVE - KERNEL4x4 - - -CGEMM_L4x4_SAVE: -/*----------------------------------------*/ - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 -#endif - - -CGEMM_L4x4_END: -/*----------------------------------------*/ - - -CGEMM_L4x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x2 - ble CGEMM_L4x2_SUB0 - bl CGEMM_4x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x2_SAVE - b CGEMM_L4x2_SUB2 - - -CGEMM_L4x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x2_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD4x2O 16,32 - END4x2_WITHOUT_ADD - LOAD4x2_2O 32, 64 - mtctr T8 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - CMP4x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD4x2_2O 32,64 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x2_SUB2_8 - bl CGEMM_4x2_L16_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x2_SUB2_4 - bl CGEMM_4x2_L8_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x2_SUB2_2 - LOAD4x2_2 - KERNEL4x2_L2 32,64, 0,0 - KERNEL4x2_E2 32,64, 1,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x2_SUB2_1 - LOAD4x2_2 - KERNEL4x2_E2 32,64, 0,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -CGEMM_L4x2_SAVE: -/*----------------------------------------*/ - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 -#endif - - -CGEMM_L4x2_END: -/*----------------------------------------*/ - - -CGEMM_L4x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x1 - ble CGEMM_L4x1_SUB0 - bl CGEMM_4x1_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L4x1_SAVE - b CGEMM_L4x1_SUB2 - - -CGEMM_L4x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x1_32K - addi BO,BO,-32 - addi AO,AO,-8 - LOAD4x1O 8,32 - END4x1_WITHOUT_ADD - LOAD4x1_2O 16, 64 - mtctr T8 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - CMP4x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-16 - LOAD4x1_2O 16,64 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x1_SUB2_8 - bl CGEMM_4x1_L16_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x1_SUB2_4 - bl CGEMM_4x1_L8_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x1_SUB2_2 - LOAD4x1_2 - KERNEL4x1_L2 16,64, 0,0 - KERNEL4x1_E2 16,64, 1,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x1_SUB2_1 - LOAD4x1_2 - KERNEL4x1_E2 16,64, 0,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -CGEMM_L4x1_SAVE: -/*----------------------------------------*/ - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 -#endif - - -CGEMM_L4x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - bgt CGEMM_L4_BEGIN - - -CGEMM_L4_END: - -b CGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -CGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -CGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 -CGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_L2 128,32,31,0 - KERNEL2x8_L2 128,32,32,0 - KERNEL2x8_L2 128,32,33,0 - KERNEL2x8_L2 128,32,34,0 - KERNEL2x8_L2 128,32,35,0 - KERNEL2x8_L2 128,32,36,0 - KERNEL2x8_L2 128,32,37,0 - KERNEL2x8_L2 128,32,38,0 - KERNEL2x8_L2 128,32,39,0 - KERNEL2x8_L2 128,32,40,0 - KERNEL2x8_L2 128,32,41,0 - KERNEL2x8_L2 128,32,42,0 - KERNEL2x8_L2 128,32,43,0 - KERNEL2x8_L2 128,32,44,0 - KERNEL2x8_L2 128,32,45,0 - KERNEL2x8_L2 128,32,46,0 - KERNEL2x8_L2 128,32,47,0 - 
KERNEL2x8_L2 128,32,48,0 - KERNEL2x8_L2 128,32,49,0 - KERNEL2x8_L2 128,32,50,0 - KERNEL2x8_L2 128,32,51,0 - KERNEL2x8_L2 128,32,52,0 - KERNEL2x8_L2 128,32,53,0 - KERNEL2x8_L2 128,32,54,0 - KERNEL2x8_L2 128,32,55,0 - KERNEL2x8_L2 128,32,56,0 - KERNEL2x8_L2 128,32,57,0 - KERNEL2x8_L2 128,32,58,0 - KERNEL2x8_L2 128,32,59,0 - KERNEL2x8_L2 128,32,60,0 - KERNEL2x8_L2 128,32,61,0 - KERNEL2x8_L2 128,32,62,0 - KERNEL2x8_L2 128,32,63,1 - bdnz CGEMM_L2x8_LOOP - MY_ALIGN -CGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -CGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_E2 128,32,31,1 - blr - MY_ALIGN - - -CGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_E2 128,32,15,1 - blr - MY_ALIGN - - -CGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_E2 128,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -CGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,0,0 -CGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_L2 64,32,7,0 - KERNEL2x4_L2 64,32,8,0 - KERNEL2x4_L2 64,32,9,0 - KERNEL2x4_L2 64,32,10,0 - KERNEL2x4_L2 64,32,11,0 - KERNEL2x4_L2 64,32,12,0 - KERNEL2x4_L2 64,32,13,0 - KERNEL2x4_L2 64,32,14,0 - KERNEL2x4_L2 64,32,15,1 - bdnz CGEMM_L2x4_LOOP - MY_ALIGN -CGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -CGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 
64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_E2 64,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_E2 64,32,3,1 - blr - - -CGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -CGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,0,0 -CGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_L2 32,32,7,0 - KERNEL2x2_L2 32,32,8,0 - KERNEL2x2_L2 32,32,9,0 - KERNEL2x2_L2 32,32,10,0 - KERNEL2x2_L2 32,32,11,0 - KERNEL2x2_L2 32,32,12,0 - KERNEL2x2_L2 32,32,13,0 - KERNEL2x2_L2 32,32,14,0 - KERNEL2x2_L2 32,32,15,1 - bdnz CGEMM_L2x2_LOOP - MY_ALIGN - - -CGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -CGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_E2 32,32,7,1 - blr - MY_ALIGN -CGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_E2 32,32,3,1 - blr - - -CGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -CGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,0,0 -CGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_L2 16,32,7,0 - KERNEL2x1_L2 16,32,8,0 - KERNEL2x1_L2 16,32,9,0 - KERNEL2x1_L2 16,32,10,0 - KERNEL2x1_L2 16,32,11,0 - KERNEL2x1_L2 16,32,12,0 - KERNEL2x1_L2 16,32,13,0 - KERNEL2x1_L2 16,32,14,0 - KERNEL2x1_L2 16,32,15,1 - bdnz CGEMM_L2x1_LOOP - MY_ALIGN -CGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -CGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_E2 16,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_E2 16,32,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L2: -/*----------------------------------------*/ - - andi. J, N, 2 - ble CGEMM_L2_END - - -CGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 3 - ble CGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble CGEMM_L2x8_SUB0 - bl CGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L2x8_SAVE - b CGEMM_L2x8_SUB2 - - -CGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD2x8O 64,16 - END2x8_WITHOUT_ADD - LOAD2x8_2O 128, 32 - mtctr T8 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8_2O 128,32 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - MY_ALIGN - - -CGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L2x8_SUB2_32 - bl CGEMM_2x8_L64_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L2x8_SUB2_16 - bl CGEMM_2x8_L32_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x8_SUB2_8 - bl CGEMM_2x8_L16_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_L2 128,32, 1,0 - KERNEL2x8_L2 128,32, 2,0 - KERNEL2x8_E2 128,32, 3,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_E2 128,32, 1,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 128,32, 0,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -CGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt CGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END - b CGEMM_L2x4_BEGIN - MY_ALIGN - - -CGEMM_L2x8_END: -/*----------------------------------------*/ - - -CGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble CGEMM_L2x4_SUB0 - bl CGEMM_2x4_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L2x4_SAVE - b CGEMM_L2x4_SUB2 - - -CGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD2x4O 32,16 - END2x4_WITHOUT_ADD - LOAD2x4_2O 64, 32 - mtctr T8 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4_2O 64,32 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x4_SUB2_8 - bl CGEMM_2x4_L16_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x4_SUB2_4 - bl CGEMM_2x4_L8_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 64,32, 0,0 - KERNEL2x4_E2 64,32, 1,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 64,32, 0,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x4_SAVE - KERNEL2x4 - - -CGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -CGEMM_L2x4_END: -/*----------------------------------------*/ - - -CGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble CGEMM_L2x2_SUB0 - bl CGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x2_SAVE - b CGEMM_L2x2_SUB2 - - -CGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD2x2O 16,16 - END2x2_WITHOUT_ADD - LOAD2x2_2O 32, 32 - mtctr T8 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2_2O 32,32 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x2_SUB2_8 - bl CGEMM_2x2_L16_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x2_SUB2_4 - bl CGEMM_2x2_L8_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 32,32, 0,0 - KERNEL2x2_E2 32,32, 1,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 32,32, 0,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -CGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -CGEMM_L2x2_END: -/*----------------------------------------*/ - - -CGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble CGEMM_L2x1_SUB0 - bl CGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x1_SAVE - b CGEMM_L2x1_SUB2 - - -CGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-16 - addi AO,AO,-8 - LOAD2x1O 8,16 - END2x1_WITHOUT_ADD - LOAD2x1_2O 16, 32 - mtctr T8 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1_2O 16,32 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x1_SUB2_8 - bl CGEMM_2x1_L16_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x1_SUB2_4 - bl CGEMM_2x1_L8_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 16,32, 0,0 - KERNEL2x1_E2 16,32, 1,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 16,32, 0,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -CGEMM_L2x1_SAVE: -/*----------------------------------------*/ - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -CGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 4 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - -CGEMM_L2_END: - - -b CGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -CGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -CGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 -CGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_L2 128,16,31,0 - KERNEL1x8_L2 128,16,32,0 - KERNEL1x8_L2 128,16,33,0 - KERNEL1x8_L2 128,16,34,0 - KERNEL1x8_L2 128,16,35,0 - KERNEL1x8_L2 128,16,36,0 - KERNEL1x8_L2 128,16,37,0 - KERNEL1x8_L2 128,16,38,0 - KERNEL1x8_L2 128,16,39,0 - KERNEL1x8_L2 128,16,40,0 - KERNEL1x8_L2 128,16,41,0 - KERNEL1x8_L2 128,16,42,0 - KERNEL1x8_L2 128,16,43,0 - KERNEL1x8_L2 128,16,44,0 - KERNEL1x8_L2 128,16,45,0 - KERNEL1x8_L2 128,16,46,0 - KERNEL1x8_L2 128,16,47,0 - KERNEL1x8_L2 128,16,48,0 - KERNEL1x8_L2 128,16,49,0 - KERNEL1x8_L2 128,16,50,0 - KERNEL1x8_L2 128,16,51,0 - KERNEL1x8_L2 128,16,52,0 - KERNEL1x8_L2 128,16,53,0 - KERNEL1x8_L2 128,16,54,0 - KERNEL1x8_L2 128,16,55,0 - KERNEL1x8_L2 128,16,56,0 - KERNEL1x8_L2 128,16,57,0 - KERNEL1x8_L2 128,16,58,0 - KERNEL1x8_L2 128,16,59,0 - KERNEL1x8_L2 128,16,60,0 - KERNEL1x8_L2 128,16,61,0 - KERNEL1x8_L2 128,16,62,0 - KERNEL1x8_L2 128,16,63,1 - bdnz CGEMM_L1x8_LOOP - MY_ALIGN -CGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -CGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 
128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_E2 128,16,31,1 - blr - MY_ALIGN - - -CGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_E2 128,16,15,1 - blr - MY_ALIGN - - -CGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_E2 128,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN -CGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,0,0 -CGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_L2 64,16,7,0 - KERNEL1x4_L2 64,16,8,0 - KERNEL1x4_L2 64,16,9,0 - KERNEL1x4_L2 64,16,10,0 - KERNEL1x4_L2 64,16,11,0 - KERNEL1x4_L2 64,16,12,0 - KERNEL1x4_L2 64,16,13,0 - KERNEL1x4_L2 64,16,14,0 - KERNEL1x4_L2 64,16,15,1 - bdnz CGEMM_L1x4_LOOP - MY_ALIGN -CGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -CGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_E2 64,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_E2 64,16,3,1 - blr - - -CGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN -CGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,0,0 -CGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_L2 32,16,7,0 - KERNEL1x2_L2 32,16,8,0 - KERNEL1x2_L2 32,16,9,0 - KERNEL1x2_L2 32,16,10,0 - KERNEL1x2_L2 32,16,11,0 - KERNEL1x2_L2 32,16,12,0 - KERNEL1x2_L2 32,16,13,0 - KERNEL1x2_L2 32,16,14,0 - KERNEL1x2_L2 32,16,15,1 - bdnz CGEMM_L1x2_LOOP - MY_ALIGN - - -CGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN -CGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - 
KERNEL1x2_E2 32,16,7,1 - blr - MY_ALIGN -CGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_E2 32,16,3,1 - blr - - -CGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN -CGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,0,0 -CGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_L2 16,16,7,0 - KERNEL1x1_L2 16,16,8,0 - KERNEL1x1_L2 16,16,9,0 - KERNEL1x1_L2 16,16,10,0 - KERNEL1x1_L2 16,16,11,0 - KERNEL1x1_L2 16,16,12,0 - KERNEL1x1_L2 16,16,13,0 - KERNEL1x1_L2 16,16,14,0 - KERNEL1x1_L2 16,16,15,1 - bdnz CGEMM_L1x1_LOOP - MY_ALIGN -CGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - - MY_ALIGN -CGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_E2 16,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_E2 16,16,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L1: -/*----------------------------------------*/ - - andi. J, N, 1 - ble CGEMM_L1_END - -CGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble CGEMM_L1x8_SUB0 - bl CGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L1x8_SAVE - b CGEMM_L1x8_SUB2 - - -CGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-8 - addi AO,AO,-64 - LOAD1x8O 64,8 - END1x8_WITHOUT_ADD - LOAD1x8_2O 128, 16 - mtctr T8 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8_2O 128,16 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - MY_ALIGN - - -CGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L1x8_SUB2_32 - bl CGEMM_1x8_L64_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. 
T1,L, 32 - ble CGEMM_L1x8_SUB2_16 - bl CGEMM_1x8_L32_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x8_SUB2_8 - bl CGEMM_1x8_L16_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_L2 128,16, 1,0 - KERNEL1x8_L2 128,16, 2,0 - KERNEL1x8_E2 128,16, 3,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_E2 128,16, 1,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 128,16, 0,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x8_SAVE - KERNEL1x8 - - MY_ALIGN -CGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt CGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END - b CGEMM_L1x4_BEGIN - MY_ALIGN - - -CGEMM_L1x8_END: -/*----------------------------------------*/ - - -CGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x4 - ble CGEMM_L1x4_SUB0 - bl CGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x4_SAVE - b CGEMM_L1x4_SUB2 - - -CGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-8 - addi AO,AO,-32 - LOAD1x4O 32,8 - END1x4_WITHOUT_ADD - LOAD1x4_2O 64, 16 - mtctr T8 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4_2O 64,16 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x4_SUB2_8 - bl CGEMM_1x4_L16_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x4_SUB2_4 - bl CGEMM_1x4_L8_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 64,16, 0,0 - KERNEL1x4_E2 64,16, 1,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 64,16, 0,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x4_SAVE - KERNEL1x4 - - -CGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -CGEMM_L1x4_END: -/*----------------------------------------*/ - - -CGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble CGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x2 - ble CGEMM_L1x2_SUB0 - bl CGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x2_SAVE - b CGEMM_L1x2_SUB2 - - -CGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-8 - addi AO,AO,-16 - LOAD1x2O 16,8 - END1x2_WITHOUT_ADD - LOAD1x2_2O 32, 16 - mtctr T8 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2_2O 32,16 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x2_SUB2_8 - bl CGEMM_1x2_L16_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x2_SUB2_4 - bl CGEMM_1x2_L8_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 32,16, 0,0 - KERNEL1x2_E2 32,16, 1,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 32,16, 0,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x2_SAVE - KERNEL1x2 - - MY_ALIGN -CGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -CGEMM_L1x2_END: -/*----------------------------------------*/ - - -CGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x1 - ble CGEMM_L1x1_SUB0 - bl CGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x1_SAVE - b CGEMM_L1x1_SUB2 - - -CGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-8 - addi AO,AO,-8 - LOAD1x1O 8,8 - END1x1_WITHOUT_ADD - LOAD1x1_2O 16, 16 - mtctr T8 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1_2O 16,16 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x1_SUB2_8 - bl CGEMM_1x1_L16_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L1x1_SUB2_4 - bl CGEMM_1x1_L8_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 16,16, 0,0 - KERNEL1x1_E2 16,16, 1,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 16,16, 0,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x1_SAVE - KERNEL1x1 - - MY_ALIGN -CGEMM_L1x1_SAVE: -/*----------------------------------------*/ - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -CGEMM_L1x1_END: -/*----------------------------------------*/ - slwi T1, K, 3 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - -CGEMM_L1_END: - - - - +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define MY_ALIGN .align 3
+b CGEMM_L4
+/* MINI SUBROUTINES */
+/* 4x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L4x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x8_2
+ MY_ALIGN
+CGEMM_L4x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+CGEMM_L4x8_K128:
+/*----------------------------------------*/
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_L2 128,64,31,0
+ KERNEL4x8_L2 128,64,32,0
+ KERNEL4x8_L2 128,64,33,0
+ KERNEL4x8_L2 128,64,34,0
+ KERNEL4x8_L2 128,64,35,0
+ KERNEL4x8_L2 128,64,36,0
+ KERNEL4x8_L2 128,64,37,0
+ KERNEL4x8_L2 128,64,38,0
+ KERNEL4x8_L2 128,64,39,0
+ KERNEL4x8_L2 128,64,40,0
+ KERNEL4x8_L2 128,64,41,0
+ KERNEL4x8_L2 128,64,42,0
+ KERNEL4x8_L2 128,64,43,0
+ KERNEL4x8_L2 128,64,44,0
+ KERNEL4x8_L2 128,64,45,0
+ KERNEL4x8_L2 128,64,46,0
+ KERNEL4x8_L2 128,64,47,0
+ KERNEL4x8_L2 128,64,48,0
+ KERNEL4x8_L2 128,64,49,0
+ KERNEL4x8_L2 128,64,50,0
+ KERNEL4x8_L2 128,64,51,0
+ KERNEL4x8_L2 128,64,52,0
+ KERNEL4x8_L2 128,64,53,0
+ KERNEL4x8_L2 128,64,54,0
+ KERNEL4x8_L2 128,64,55,0
+ KERNEL4x8_L2 128,64,56,0
+ KERNEL4x8_L2 128,64,57,0
+ KERNEL4x8_L2 128,64,58,0
+ KERNEL4x8_L2 128,64,59,0
+ KERNEL4x8_L2 128,64,60,0
+ KERNEL4x8_L2 128,64,61,0
+ KERNEL4x8_L2 128,64,62,0
+ KERNEL4x8_L2 128,64,63,1
+ bdnz CGEMM_L4x8_LOOP
+ MY_ALIGN
+CGEMM_L4x8_LOOP_END:
+/*----------------------------------------*/
+ END4x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
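+/* the remaining unrolled KERNEL4x8_L2 steps of CGEMM_4x8_L64_SUB follow, closed by
+   the KERNEL4x8_E2 variant; each step consumes two k-iterations (AO advances 128
+   bytes, BO 64), and the interleaved dcbt touches at PRE and T2..T5 keep the A/B
+   panels streaming into cache ahead of the arithmetic. */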
KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + 
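+/* CGEMM_4x2_L16_SUB and CGEMM_4x2_L8_SUB below consume 16 or 8 leftover
+   k-iterations for a 4x2 tile, two per KERNEL4x2 step, then return (blr)
+   to the remainder dispatch code. */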
blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
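/* For orientation: the CGEMM_* subroutines above unroll the inner K loop as a
   software-pipelined main loop (srawi derives the unroll count from K-2) plus
   progressively smaller power-of-two tail chunks selected with andi. (64/32/16/8/4/2/1),
   and the save paths scale the accumulated complex products by alpha in two fused
   multiply passes (MULT_APLHA_PART1/PART2). The scalar C sketch below is only an
   illustration of that arithmetic for the non-conjugated (NN) case; the packed
   data layout and the name cgemm_ref_tile are hypothetical, not taken from the kernel. */

   #include <stddef.h>

   /* One micro-tile of C[m x n] += alpha * A[m x k] * B[k x n], single-precision
      complex stored as interleaved (re, im) pairs in illustrative packed buffers. */
   static void cgemm_ref_tile(size_t m, size_t n, size_t k,
                              float alpha_r, float alpha_i,
                              const float *a, const float *b,
                              float *c, size_t ldc)
   {
       for (size_t j = 0; j < n; j++) {
           for (size_t i = 0; i < m; i++) {
               float acc_r = 0.0f, acc_i = 0.0f;
               /* the assembly walks this loop in 128x/32x unrolled blocks,
                  then finishes the remainder in 64/32/16/8/4/2/1 chunks */
               for (size_t l = 0; l < k; l++) {
                   float ar = a[2 * (l * m + i)], ai = a[2 * (l * m + i) + 1];
                   float br = b[2 * (l * n + j)], bi = b[2 * (l * n + j) + 1];
                   acc_r += ar * br - ai * bi;  /* conjugate variants flip these signs */
                   acc_i += ar * bi + ai * br;
               }
               /* alpha applied in two passes, analogous to MULT_APLHA_PART1/PART2:
                  first multiply by alpha_i, then fused multiply-sub/add by alpha_r */
               float t_r = acc_i * alpha_i;
               float t_i = acc_r * alpha_i;
               t_r = acc_r * alpha_r - t_r;
               t_i = acc_i * alpha_r + t_i;
               /* non-TRMM path accumulates into C, as in the SAVE macros */
               c[2 * (j * ldc + i)]     += t_r;
               c[2 * (j * ldc + i) + 1] += t_i;
           }
       }
   }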
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S index a256e1a01..be2b74f01 100644 --- a/kernel/power/cgemm_macros_power9.S +++ b/kernel/power/cgemm_macros_power9.S @@ -1,3019 +1,3019 @@ - -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - - -.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - -/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmulsp \VSOUT1,\VSINII, alpha_i - xvmulsp \VSOUT2,\VSINRR, alpha_i -.endm - -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubasp \VSOUT1,\VSINRR, alpha_r - xvmaddasp \VSOUT2,\VSINII, alpha_r -.endm - -/* macros for N=4 and M=8 -**********************************************************************************************/ - -.macro Zero4x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, 
vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD4x8 - LOAD4x8O 0,0 -.endm - - -.macro LOAD4x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_NORMAL - END4x8 AO,BO,64,32 -.endm - - -.macro END4x8_WITHOUT_ADD - END4x8 AO,BO,0,0 -.endm - - -.macro END4x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.endm - - -.macro LOAD4x8_2 - LOAD4x8_2O 0,0 -.endm - - -.macro LOAD4x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_2 - /*for load2 offset will be 128 and 64*/ - KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 -.endm - - -.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL4x8 - LOAD4x8 - END4x8 AO, BO, 64,32 -.endm - - -.macro SAVE4x8 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm 
vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - xxperm vs2,vs50,permute_mask - xxperm vs6,vs58,permute_mask - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - xxperm vs3,vs51,permute_mask - xxperm vs7,vs59,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 - xxperm vs10,vs54,permute_mask - xxperm vs14,vs62,permute_mask - AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 - xxperm vs11,vs55,permute_mask - xxperm vs15,vs63,permute_mask - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - #ifndef TRMMKERNEL - lxv vs32 , 0(T2) - lxv vs40 , 16(T2) -#endif - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs33 , 32(T2) - lxv vs41 , 48(T2) -#endif - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 -#ifndef TRMMKERNEL - lxv vs34 , 0(T3) - lxv vs42 , 16(T3) -#endif - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs35 , 32(T3) - lxv vs43 , 48(T3) -#endif - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - MULT_APLHA_PART1 vs48,vs56,vs0,vs1 - MULT_APLHA_PART1 vs49,vs57,vs2,vs3 - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - MULT_APLHA_PART1 vs50,vs58,vs4,vs5 - MULT_APLHA_PART1 vs51,vs59,vs6,vs7 - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - MULT_APLHA_PART2 vs48,vs56,vs0,vs1 - MULT_APLHA_PART2 vs49,vs57,vs2,vs3 - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 - MULT_APLHA_PART2 vs51,vs59,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs8,vs9 - MULT_APLHA_PART1 vs53,vs61,vs10,vs11 - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - MULT_APLHA_PART1 vs54,vs62,vs12,vs13 - MULT_APLHA_PART1 vs55,vs63,vs14,vs15 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - MULT_APLHA_PART2 vs52,vs60,vs8,vs9 - MULT_APLHA_PART2 vs53,vs61,vs10,vs11 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - MULT_APLHA_PART2 vs54,vs62,vs12,vs13 - MULT_APLHA_PART2 vs55,vs63,vs14,vs15 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs32,vs32,vs1 - xvaddsp vs40,vs40,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs33,vs33,vs5 - xvaddsp vs41,vs41,vs7 - xvaddsp vs34,vs34,vs9 - xvaddsp vs42,vs42,vs11 - xvaddsp vs35,vs35,vs13 - xvaddsp vs43,vs43,vs15 -#else - xxpermdi vs32,vs8,vs0,2 - xxpermdi vs40,vs10,vs2,2 - xxpermdi vs33,vs12,vs4,2 - xxpermdi vs41,vs14,vs6,2 - xxpermdi vs34,vs0,vs8,2 - xxpermdi vs42,vs2,vs10,2 - xxpermdi vs35,vs4,vs12,2 - xxpermdi vs43,vs6,vs14,2 -#endif - stxv vs32 , 0(T2) - stxv vs40 , 16(T2) - stxv vs33 , 32(T2) - stxv vs41 , 48(T2) - stxv vs34 , 0(T3) - stxv vs42 , 16(T3) - stxv vs35 , 32(T3) - stxv vs43 , 48(T3) - addi CO, CO, 64 -.endm - -/* macros for N=4 and M=4 -**********************************************************************************************/ - -.macro Zero4x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endm - - -.macro LOAD4x4 - LOAD4x4O 0,0 -.endm - - -.macro LOAD4x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_NORMAL - END4x4 AO,BO,32,32 -.endm - - -.macro END4x4_WITHOUT_ADD - END4x4 AO,BO,0,0 -.endm - - -.macro END4x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.endm - - -.macro LOAD4x4_2 - LOAD4x4_2O 0,0 -.endm - - -.macro LOAD4x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv 
vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_2 - /*for load2 offset will be 64 and 64*/ - KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 -.endm - - -.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x4 - LOAD4x4 - END4x4 AO, BO, 32,32 -.endm - - -.macro SAVE4x4 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - #ifndef TRMMKERNEL - lxv vs28 , 0(T2) - lxv vs29 , 16(T2) -#endif -#ifndef TRMMKERNEL - lxv vs30 , 0(T3) - lxv vs31 , 16(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs48,vs56,vs4,vs5 - MULT_APLHA_PART1 vs49,vs57,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs12,vs13 - MULT_APLHA_PART1 vs53,vs61,vs14,vs15 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs48,vs56,vs4,vs5 - MULT_APLHA_PART2 vs49,vs57,vs6,vs7 - MULT_APLHA_PART2 vs52,vs60,vs12,vs13 - MULT_APLHA_PART2 vs53,vs61,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 - xvaddsp vs28,vs28,vs5 - xvaddsp vs29,vs29,vs7 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 - xxpermdi vs28,vs12,vs4,2 - xxpermdi vs29,vs14,vs6,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - stxv vs28 , 0(T2) - stxv vs29 , 16(T2) - stxv vs30 , 0(T3) - stxv vs31 , 16(T3) - addi CO, CO, 32 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD4x2 - LOAD4x2O 0,0 -.endm - - -.macro LOAD4x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_NORMAL - END4x2 AO,BO,16,32 -.endm - - -.macro END4x2_WITHOUT_ADD - END4x2 AO,BO,0,0 -.endm - - -.macro END4x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD4x2_2 - LOAD4x2_2O 0,0 -.endm - - -.macro LOAD4x2_2O OffsetA,OffsetB - lxv vs8, 
(\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_2 - /*for load2 offset will be 32 and 64*/ - KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 -.endm - - -.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x2 - LOAD4x2 - END4x2 AO, BO, 16,32 -.endm - - -.macro SAVE4x2 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs25 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxv vs27 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs10,vs2,0 - xxpermdi vs3,vs0,vs8,3 - xxpermdi vs11,vs2,vs10,3 - xvaddsp vs24,vs24,vs1 - 
xvaddsp vs26,vs26,vs9 - xvaddsp vs25,vs25,vs3 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs10,vs2,0 - xxpermdi vs25,vs0,vs8,3 - xxpermdi vs27,vs2,vs10,3 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 0(T1) - stxv vs26 , 0(T2) - stxv vs27 , 0(T3) - addi CO, CO, 16 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD4x1 - LOAD4x1O 0,0 -.endm - - -.macro LOAD4x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END4x1_NORMAL - END4x1 AO,BO,8,32 -.endm - - -.macro END4x1_WITHOUT_ADD - END4x1 AO,BO,0,0 -.endm - - -.macro END4x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD4x1_2 - LOAD4x1_2O 0,0 -.endm - - -.macro LOAD4x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) -.endm - - -.macro END4x1_2 - /*for load2 offset will be 16 and 64*/ - KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 -.endm - - -.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x1 - LOAD4x1 - END4x1 AO, BO, 8,32 -.endm - - -.macro SAVE4x1 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxsd v6 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxsd v7 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - 
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - xxspltd vs9,vs2,0 - xxspltd vs11,vs2,1 - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 - xvaddsp vs38,vs38,vs9 - xvaddsp vs39,vs39,vs11 -#else - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 - xxspltd vs38,vs2,0 - xxspltd vs39,vs2,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - stxsd v6 , 0(T2) - stxsd v7 , 0(T3) - addi CO, CO, 8 -.endm - -/* macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,64,16 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_2 - /*for load2 offset will be 128 and 32*/ - KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 -.endm - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
-.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 64,16 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - addi CO, CO, 64 -.endm - -/* macros for N=2 and M=4 -**********************************************************************************************/ - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,32,16 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_2 - /*for load2 offset will be 64 and 32*/ - KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 -.endm - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 32,16 -.endm - - -.macro SAVE2x4 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - addi CO, CO, 32 -.endm - -/* macros for N=2 and M=2 -**********************************************************************************************/ - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs36, vs36, vs36 - xxlxor vs40, vs40, vs40 - xxlxor vs44, vs44, vs44 -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,16,16 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv 
vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_2 - /*for load2 offset will be 32 and 32*/ - KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 -.endm - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs44, vs0,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 16,16 -.endm - - -.macro SAVE2x2 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs8,vs9, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs0,vs8,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs0,vs8,3 -#endif - stxv vs24 , 0(CO) - stxv vs26 , 0(T1) - addi CO, CO, 16 -.endm - -/* macros for N=2 and M=1 -**********************************************************************************************/ - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,8,16 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv 
vs0, (16+\OffsetB)(BO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_2 - /*for load2 offset will be 16 and 32*/ - KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 -.endm - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 8,16 -.endm - - -.macro SAVE2x1 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - /*--v4==vs36 v5==vs37---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 -#else - /*--v4==vs36 v5==vs37---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - addi CO, CO, 8 -.endm - -/* macros for N=1 and M=8 -**********************************************************************************************/ - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,64,8 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, 
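The N=1 macros in this stretch (LOAD1x8, LOAD1x4 and their KERNEL1x*_2 forms) broadcast a single complex element of B (lxsd plus xxspltd) against a strip of A elements. Below is a scalar sketch of that strip update, assuming A is packed with m complex values per k step and the non-conjugated (NN) case; the vector code keeps the r*r/i*i and r*i/i*r partial sums in separate accumulators and only fixes the signs later in AGGREGATE_REALS_IMAGES. All names here are illustrative.

/* N=1 edge case: one B element broadcast against an m-wide strip of A (sketch). */
static void cgemm_n1_strip(int m, int k,
                           const float *a,      /* packed A: k steps of m {r,i} pairs */
                           const float *b,      /* packed B: k {r,i} pairs            */
                           float *acc_r, float *acc_i)  /* m partial sums             */
{
    for (int l = 0; l < k; l++) {
        float br = b[2 * l], bi = b[2 * l + 1];      /* broadcast one complex B value */
        for (int i = 0; i < m; i++) {
            float ar = a[2 * (l * m + i)];
            float ai = a[2 * (l * m + i) + 1];
            acc_r[i] += ar * br - ai * bi;           /* NN-case sign convention */
            acc_i[i] += ar * bi + ai * br;
        }
    }
}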
(64+16+\OffsetA)(AO) - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_2 - /*for load2 offset will be 128 and 16*/ - KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 -.endm - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 64,8 -.endm - - -.macro SAVE1x8 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 - xxperm vs4,vs5, vs28 - xxperm vs6,vs7, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - xvaddsp vs26,vs26,vs4 - xvaddsp vs27,vs27,vs6 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) - stxv vs4 , 32(CO) - stxv vs6 , 48(CO) -#endif - addi CO, CO, 64 -.endm - -/* macros for N=1 and M=4 
-**********************************************************************************************/ - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,32,8 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_2 - /*for load2 offset will be 64 and 16*/ - KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 -.endm - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 32,8 -.endm - - -.macro SAVE1x4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) -#endif - addi CO, CO, 32 -.endm - -/* macros for N=1 and M=2 
-**********************************************************************************************/ - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,16,8 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs0, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_2 - /*for load2 offset will be 32 and 16*/ - KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 -.endm - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP4(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 16,8 -.endm - - -.macro SAVE1x2 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - stxv vs24 , 0(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) -#endif - addi CO, CO, 16 -.endm - -/* macros for N=1 and M=1 -**********************************************************************************************/ -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxsd v4, (\OffsetB+0)(BO) - lxsd v5, (\OffsetA+0)(AO) - xxperm vs38, vs36, permute_mask -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,8,8 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs37,vs36 - xvmaddasp vs40, vs37,vs38 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro 
LOAD1x1_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask -.endm - - -.macro END1x1_2 - /*for load2 offset will be 16 and 16*/ - KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 -.endm - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP2(\Index,\OffsetB)(\BREG) - lxv vs4, DISP2(\Index,\OffsetB)(\AREG) - xxperm vs10, vs8, permute_mask -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP2(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP2(\Index,16) -.endif - -.endif -.endm - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 8,8 -.endm - - -.macro SAVE1x1 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - /*aggregate x2*/ - xxpermdi vs33,vs32,vs32,2 - xxpermdi vs41,vs40,vs40,2 - xvaddsp vs32,vs32,vs33 - xvaddsp vs40,vs40,vs41 - - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs37,vs1 - MULT_APLHA_PART2 vs32,vs40,vs37,vs1 - -/* reconstruct r,i pairs*/ - xxperm vs37,vs1, vs28 - -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs36,vs36,vs37 - stxsd v4 , 0(CO) -#else - -/* vs37 is v5 */ - stxsd v5 , 0(CO) -#endif - addi CO, CO, 8 -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*8; -// ptrbb = bb + off*4; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+8; // number of values in A -// #else -// temp = off+4; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 8; // number of values in A -// #else -// temp -= 4; // number of values in B -// #endif -// ptrba += temp*8; -// ptrbb += temp*4; -// #endif - -// #ifdef LEFT -// off += 8; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
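The TRMM pointer-refresh macros above encode the bookkeeping that their embedded C-style comments spell out. Here is a C rendering of REFRESH_POINTERS and REFRESH_TEMP_BK under those comments' conventions; c_a/c_b are the tile widths in complex elements (8 and 4 for the 4x8 kernel), and the SHIFT_REG helper performs the same element-to-byte scaling with slwi because one single-precision complex value occupies unit_size = 8 bytes. Function and parameter names are hypothetical.

typedef struct { float r, i; } cfloat;   /* 8 bytes: matches unit_size */

/* REFRESH_POINTERS: position the packed-A/packed-B pointers for a TRMM tile. */
static void refresh_pointers(const cfloat **ptrba, const cfloat **ptrbb,
                             const cfloat *bb, int off, int c_a, int c_b,
                             int b_from_panel_start) /* (LEFT&&TRANSA)||(!LEFT&&!TRANSA) */
{
    if (b_from_panel_start) {
        *ptrbb = bb;                        /* ptrbb = bb                  */
    } else {
        *ptrba += (long)off * c_a;          /* ptrba += off*c_a elements   */
        *ptrbb  = bb + (long)off * c_b;     /* ptrbb  = bb + off*c_b       */
    }
}

/* REFRESH_TEMP_BK: how many k iterations this tile actually runs. */
static int refresh_temp_bk(int bk, int off, int c_a, int c_b,
                           int full_depth,  /* (LEFT&&!TRANSA)||(!LEFT&&TRANSA) */
                           int left)
{
    if (full_depth)
        return bk - off;
    return left ? off + c_a      /* number of values in A */
                : off + c_b;     /* number of values in B */
}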
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha image instead to fix sign*/
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha image instead to fix sign*/
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
+
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56,
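Two notes on the helpers just defined. The DISPn(ind,disp) macros only turn an unroll index into a byte offset: with unit_size = 8 bytes per single-precision complex value, DISP16(ind,disp) = ind*16*8 + disp, i.e. the start of the ind-th group of sixteen complex elements plus a fixed displacement. The sign choices in AGGREGATE_REALS_IMAGES follow the ordinary complex-product identities; below is a scalar sketch of the two simplest variants, assuming (as the register usage suggests) that the four products a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r are accumulated separately. Names are illustrative, not from the kernel.

/* Combine separately accumulated products into real/imag results (sketch). */
static void combine(float arbr, float aibi, float arbi, float aibr,
                    int conj_a,           /* conjugated A, as in the CN/CT cases */
                    float *out_r, float *out_i)
{
    if (!conj_a) {                 /* NN-type: plain complex product */
        *out_r = arbr - aibi;
        *out_i = arbi + aibr;
    } else {                       /* conj(a)*b flips both signs     */
        *out_r = arbr + aibi;
        *out_i = arbi - aibr;
    }
}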
vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD4x8 + LOAD4x8O 0,0 +.endm + + +.macro LOAD4x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_NORMAL + END4x8 AO,BO,64,32 +.endm + + +.macro END4x8_WITHOUT_ADD + END4x8 AO,BO,0,0 +.endm + + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.endm + + +.macro LOAD4x8_2 + LOAD4x8_2O 0,0 +.endm + + +.macro LOAD4x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 +.endm + + +.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
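+/* non-TRMM path: the remaining xvaddsp ops accumulate the alpha-scaled results into the C values loaded above from T1, T2 and T3 (vs25, vs26, vs27) */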
xvaddsp vs26,vs26,vs9 + xvaddsp vs25,vs25,vs3 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs10,vs2,0 + xxpermdi vs25,vs0,vs8,3 + xxpermdi vs27,vs2,vs10,3 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 0(T1) + stxv vs26 , 0(T2) + stxv vs27 , 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD4x1 + LOAD4x1O 0,0 +.endm + + +.macro LOAD4x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END4x1_NORMAL + END4x1 AO,BO,8,32 +.endm + + +.macro END4x1_WITHOUT_ADD + END4x1 AO,BO,0,0 +.endm + + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD4x1_2 + LOAD4x1_2O 0,0 +.endm + + +.macro LOAD4x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) +.endm + + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 +.endm + + +.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8,32 +.endm + + +.macro SAVE4x1 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + 
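+/* MULT_APLHA_PART1 and MULT_APLHA_PART2 together perform the full complex scaling by alpha: out_r = r*alpha_r - i*alpha_i, out_i = i*alpha_r + r*alpha_i (see the macro definitions at the top of this file) */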
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
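+/* the loads above refresh vs4/vs5 with the A data for the next pass of the unrolled K loop so they overlap the remaining multiply-adds; they are skipped on the loop tail, i.e. when the Complete argument is 1 */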
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 +.endm + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index,\OffsetB)(\BREG) + lxv vs4, DISP2(\Index,\OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP2(\Index,16) +.endif + +.endif +.endm + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33,vs32,vs32,2 + xxpermdi vs41,vs40,vs40,2 + xvaddsp vs32,vs32,vs33 + xvaddsp vs40,vs40,vs41 + + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs37,vs1 + MULT_APLHA_PART2 vs32,vs40,vs37,vs1 + +/* reconstruct r,i pairs*/ + xxperm vs37,vs1, vs28 + +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36,vs36,vs37 + stxsd v4 , 0(CO) +#else + +/* vs37 is v5 */ + stxsd v5 , 0(CO) +#endif + addi CO, CO, 8 +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index 8663039c5..575847da2 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -1,597 +1,597 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
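
Before the cgemv_n.c changes continue below, a note on the TRMM pointer-refresh macros that close the assembly file above: they implement exactly the bookkeeping spelled out in their commented pseudo-code. The plain-C restatement below uses hypothetical names (`trmm_state`, `refresh_pointers` and friends are not OpenBLAS symbols); the arithmetic follows the comments verbatim, while the assembly performs the equivalent multiplies in bytes via SHIFT_REG.

```c
/* Scalar restatement of the TRMM pointer bookkeeping described in the
 * commented pseudo-code around REFRESH_POINTERS / REFRESH_TEMP_BK /
 * REFRESH_AFTER_SAVE.  c_a and c_b stand for the per-tile value counts
 * passed as C_A and C_B (8 and 4 in the comments above). */
typedef long blaslong_t;            /* stand-in for BLASLONG             */

struct trmm_state {
    float *ptrba, *ptrbb;           /* current A and B panel pointers    */
    blaslong_t off, bk;             /* running offset and panel depth    */
};

/* Entering a tile: position the panel pointers (REFRESH_POINTERS).
 * cond is the test (LEFT && TRANSA) || (!LEFT && !TRANSA). */
void refresh_pointers(struct trmm_state *s, float *bb,
                      blaslong_t c_a, blaslong_t c_b, int cond)
{
    if (cond) {
        s->ptrbb = bb;                    /* ptrbb = bb                  */
    } else {
        s->ptrba += s->off * c_a;         /* ptrba += off*C_A            */
        s->ptrbb = bb + s->off * c_b;     /* ptrbb  = bb + off*C_B       */
    }
}

/* Inner trip count (REFRESH_TEMP_BK).
 * cond is (LEFT && !TRANSA) || (!LEFT && TRANSA). */
blaslong_t refresh_temp_bk(const struct trmm_state *s,
                           blaslong_t c_a, blaslong_t c_b,
                           int cond, int left)
{
    if (cond)
        return s->bk - s->off;            /* temp = bk - off             */
    return left ? s->off + c_a            /* temp = off + values in A    */
                : s->off + c_b;           /* temp = off + values in B    */
}

/* After storing a tile: skip the untouched tail of both panels and bump
 * the offset (REFRESH_AFTER_SAVE).
 * cond is (LEFT && TRANSA) || (!LEFT && !TRANSA). */
void refresh_after_save(struct trmm_state *s,
                        blaslong_t c_a, blaslong_t c_b,
                        int cond, int left)
{
    if (cond) {
        blaslong_t temp = s->bk - s->off - (left ? c_a : c_b);
        s->ptrba += temp * c_a;
        s->ptrbb += temp * c_b;
    }
    if (left)
        s->off += c_a;                    /* off += values in A          */
}
```
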
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/zgemv_n.c" -#else - -#include -#include -#include "common.h" -#include -#define NBMAX 1024 - - -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; - register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; - register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; - register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; - register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; - register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; - register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; - register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; - register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - register __vector float *vptr_a2 = (__vector float *) a2; - register __vector float *vptr_a3 = (__vector float *) a3; - BLASLONG i = 0; - BLASLONG i2=16; - for (;i< n * 8; i+=32,i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; - va0 = vec_perm(va0, va0,swap_mask); - va0_1 = vec_perm(va0_1, va0_1,swap_mask); - va1 = vec_perm(va1, va1,swap_mask); - va1_1 = vec_perm(va1_1, va1_1,swap_mask); - va2 = vec_perm(va2, va2,swap_mask); - va2_1 = vec_perm(va2_1, va2_1,swap_mask); - va3 = vec_perm(va3, va3,swap_mask); - va3_1 = vec_perm(va3_1, va3_1,swap_mask); - vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; - vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - register __vector float va1x = vec_perm(va1, va1,swap_mask); - register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); - vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) ap; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } -} - - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i=0; - - - if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" +#include +#define NBMAX 1024 + + +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + 
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; + register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; + register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; + register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; + register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; + register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; + register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; + register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; + register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + register __vector float *vptr_a2 = (__vector float *) a2; + register __vector float *vptr_a3 = (__vector float *) a3; + BLASLONG i = 0; + BLASLONG i2=16; + for (;i< n * 8; i+=32,i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + 
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i=0; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* vptr_a2 = (__vector float*) a2; - __vector float* vptr_a3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( 
i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - vtemp2_p += vx_0*va2 + vx_1*va2_1; - vtemp2_r += vxr_0*va2 + vxr_1*va2_1; - vtemp3_p += vx_0*va3 + vx_1*va3_1; - vtemp3_r += vxr_0*va3 + vxr_1*va3_1; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - -#endif - -} - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) 
r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - - - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - - } -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - -#endif - -} - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) ap; - __vector float* v_x = (__vector float*) x; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - 
vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - -#endif - - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i=0; - BLASLONG j=0; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += 
a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += 
inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); - -} -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
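
The transposed cgemv kernels that follow accumulate, per column of A, one "p" sum from the unswapped x (real·real and imag·imag products) and one "r" sum from the swap-masked x (real·imag and imag·real products), and apply the CONJ/XCONJ signs only in the final horizontal reduction; the scalar tail paths in the same file apply those signs directly. The sketch below restates one column in scalar C for the default sign case (no CONJ, no XCONJ); `cgemv_t_column_sketch` is an illustrative name, not an OpenBLAS function.

```c
#include <stddef.h>

/* One column of the transposed complex GEMV in scalar form: the dot
 * product of a column of A with x, then the alpha update of one complex
 * element of y.  Storage is interleaved: element k is (v[2k], v[2k+1]).
 * Default sign convention (neither CONJ nor XCONJ defined). */
void cgemv_t_column_sketch(size_t n, const float *a_col, const float *x,
                           float *y, float alpha_r, float alpha_i)
{
    float temp_r = 0.0f, temp_i = 0.0f;
    for (size_t k = 0; k < n; k++) {
        float ar = a_col[2 * k], ai = a_col[2 * k + 1];
        float xr = x[2 * k],     xi = x[2 * k + 1];
        temp_r += ar * xr - ai * xi;   /* signs applied per element here;  */
        temp_i += ar * xi + ai * xr;   /* the VSX code defers them to the  */
    }                                  /* horizontal reduction             */
    /* y += alpha * temp, complex multiply-accumulate */
    y[0] += alpha_r * temp_r - alpha_i * temp_i;
    y[1] += alpha_r * temp_i + alpha_i * temp_r;
}
```
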
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else + +#include "common.h" + +#define NBMAX 1024 +#include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* vptr_a2 = (__vector float*) a2; + __vector float* vptr_a3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + vtemp2_p += vx_0*va2 + vx_1*va2_1; + vtemp2_r += vxr_0*va2 + vxr_1*va2_1; + vtemp3_p += vx_0*va3 + vx_1*va3_1; + vtemp3_r += vxr_0*va3 + vxr_1*va3_1; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + 
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + + + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * 
temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i=0; + BLASLONG j=0; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + 
y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + 
y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 84ba5d913..dbd7e3482 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -1,233 +1,233 @@ -/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if defined(POWER8) || defined(POWER9) || defined(POWER10) -#if defined(__VEC__) || defined(__ALTIVEC__) - -static void crot_kernel_8 (long n, float *x, float *y, float c, float s) -{ - __vector float t0; - __vector float t1; - __vector float t2; - __vector float t3; - __vector float t4; - __vector float t5; - __vector float t6; - __vector float t7; - __asm__ - ( - "xscvdpspn 36, %x[cos] \n\t" // load c to all words - "xxspltw 36, 36, 0 \n\t" - "xscvdpspn 37, %x[sin] \n\t" // load s to all words - "xxspltw 37, 37, 0 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 64 \n\t" - "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble two%= \n\t" - ".align 5 \n\t" - "one%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 128 \n\t" - "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "bgt one%= \n\t" - "two%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] " - : - [mem_x] "+m" (*(float (*)[2*n])x), - [mem_y] "+m" (*(float (*)[2*n])y), - [temp_n] "+r" (n), - [x_ptr] "+&b" (x), - [y_ptr] "+&b" (y), - [x0] "=wa" (t0), - [x1] "=wa" (t2), - [x2] "=wa" (t1), - [x3] "=wa" (t3), - [x4] "=wa" (t4), - [x5] "=wa" (t5), - [x6] "=wa" (t6), - [x7] "=wa" (t7) - : - [cos] "f" (c), - [sin] "f" (s), - [i16] "b" (16), - [i32] "b" (32), - [i48] "b" (48) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" - ); -} - -#endif -#endif - - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { -#if defined(__VEC__) || defined(__ALTIVEC__) - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x, y, c, s); - i=n1; - ix=2*n1; - } -#endif - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; - - } - - } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - } - return(0); -} - +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "bgt one%= \n\t" + "two%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } +#endif + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } + return(0); +} + diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index 2fb1b27ef..86108f20c 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -1,249 +1,249 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld - - - - -#define STACKSIZE (512 ) -#define ALPHA_SP (296+192)(SP) -#define FZERO (304+192)(SP) - - - -#define M r3 -#define N r4 -#define K r5 - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs18 - -#define o0 0 - - -#define T4 r12 -#define T3 r11 -#define C4 r14 -#define o8 r15 -#define o24 r16 -#define C2 r17 -#define L r18 -#define T1 r19 -#define C3 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define o16 r27 -#define o32 r28 -#define o48 r29 - -#define PRE r30 -#define T2 r31 - -#include "dgemm_macros_power9.S" - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - - stfd f1, ALPHA_SP - stw r0, FZERO - - slwi LDC, LDC, BASE_SHIFT - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - - - cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 - - - - addi T1, SP, 296+192 - - - li PRE, 384 - li o8 , 8 - li o16, 16 - li o24, 24 - li o32, 32 - li o48, 48 - - - lxvdsx alpha_r, 0, T1 - -#include 
"dgemm_logic_power9.S" - -.L999: - addi r3, 0, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S index 251839d19..a48bc685a 100644 --- a/kernel/power/dgemm_logic_power9.S +++ b/kernel/power/dgemm_logic_power9.S @@ -1,1981 +1,1981 @@ -/*************************************************************************** 
-Copyright (c) 2013-2019 The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#define MY_ALIGN .align 3 - -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 2 - ble LDGEMM_L4_END - -LDGEMM_L4_BEGIN: - - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LDGEMM_L4x16_END - - MY_ALIGN -LDGEMM_L4x16_BEGIN: - - li L, -128 - - - SAVE4x16_REGS - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - - and T1, CO, L - and T2, C2, L - and T3, C3, L - and T4, C4, L - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - - - addi T1, T1, 128 - addi T2, T2, 128 - addi T3, T3, 128 - addi T4, T4, 128 - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 - srawi. L, T3, 5 -#else - srawi. L, K, 5 -#endif - - ble LDGEMM_L4x16_SUB0 - - - MY_ALIGN -LDGEMM_L4x16_LOOP_START: - - li T2, 512 - - - LOAD4x16_1 - ##OffsetA=128 OffsetB=32 - addi AO,AO,2176 - # addi BO,BO,32 - addic. 
L, L, -1 - - ble LDGEMM_L4x16_LOOP_END - - - mtctr L - - MY_ALIGN - -LDGEMM_L4x16_LOOP: - - #dcbt AO, PRE - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_2 -2048,32, 15,1 - - - bdnz LDGEMM_L4x16_LOOP - - MY_ALIGN - MY_ALIGN -LDGEMM_L4x16_LOOP_END: - - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_3 -2048,32, 15,1 - b LDGEMM_L4x16_SUB1 - - - MY_ALIGN -LDGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - KERNEL4x16 1 - - addic. L, L, -1 - ble LDGEMM_L4x16_SAVE - b LDGEMM_L4x16_SUB2 - MY_ALIGN -LDGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - ble LDGEMM_L4x16_SAVE - MY_ALIGN -LDGEMM_L4x16_SUB2: - - andi. T1,L, 16 - ble LDGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_2 128,32, 3,0 - KERNEL4x16_I1_L2_2 128,32, 4,0 - KERNEL4x16_I1_L2_2 128,32, 5,0 - KERNEL4x16_I1_L2_2 128,32, 6,0 - KERNEL4x16_I1_L2_3 128,32, 7,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_8: - andi. T1,L, 8 - ble LDGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_3 128,32, 3,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_3 128,32, 1,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 128,32, 0,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LDGEMM_L4x16_SUB2 - - MY_ALIGN -LDGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LDGEMM_L4x16_BEGIN - -LDGEMM_L4x16_END: - -LDGEMM_L4x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L4x1_END - - andi. T1, M, 8 - ble LDGEMM_L4x8_END - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 - srawi. L, T3, 4 -#else - mr BO, B - srawi. L, K, 4 -#endif - - - ble LDGEMM_L4x8_SUB0 - -LDGEMM_L4x8_LOOP_START: - - - LOAD4x8_1 - ##OffsetA=64 OffsetB=32 - - - addic. 
L, L, -1 - - ble LDGEMM_L4x8_LOOP_END - - mtctr L - MY_ALIGN - -LDGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_2 64,32, 7,1 - - bdnz LDGEMM_L4x8_LOOP - MY_ALIGN -LDGEMM_L4x8_LOOP_END: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_3 64,32, 7,1 - - b LDGEMM_L4x8_SUB1 - MY_ALIGN -LDGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - KERNEL4x8 1 - - addic. L, L, -1 - ble LDGEMM_L4x8_SAVE - b LDGEMM_L4x8_SUB2 - MY_ALIGN -LDGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - ble LDGEMM_L4x8_SAVE - MY_ALIGN -LDGEMM_L4x8_SUB2: - - andi. T1,L, 8 - ble LDGEMM_L4x8_SUB2_4 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_3 64,32, 3,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_3 64,32, 1,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 64,32, 0,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x8_SAVE - KERNEL4x8 0 - - MY_ALIGN -LDGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 -#endif -LDGEMM_L4x8_END: - -LDGEMM_L4x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x4_SUB4 - -LDGEMM_L4x4_LOOP_START: - - #dcbt AO, PRE - LOAD4x4_1 - KERNEL4x4_I1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -2 - ble LDGEMM_L4x4_LOOP_END - - MY_ALIGN - -LDGEMM_L4x4_LOOP: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -1 - bgt LDGEMM_L4x4_LOOP - -LDGEMM_L4x4_LOOP_END: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_E2 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB4: - - KERNEL4x4_SUBI1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x4_SAVE - b LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x4_SAVE - -LDGEMM_L4x4_SUB2: - - KERNEL4x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SAVE: - - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 -#endif -LDGEMM_L4x4_END: - -LDGEMM_L4x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L4x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x2_SUB4 - -LDGEMM_L4x2_LOOP_START: - - LOAD4x2_1 - KERNEL4x2_I1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -2 - ble LDGEMM_L4x2_LOOP_END - - MY_ALIGN - -LDGEMM_L4x2_LOOP: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -1 - bgt LDGEMM_L4x2_LOOP - -LDGEMM_L4x2_LOOP_END: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_E2 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB4: - - KERNEL4x2_SUBI1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x2_SAVE - b LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x2_SAVE - -LDGEMM_L4x2_SUB2: - - KERNEL4x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SAVE: - - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 -#endif -LDGEMM_L4x2_END: - -LDGEMM_L4x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x1_SUB4 - -LDGEMM_L4x1_LOOP_START: - - LOAD4x1_1 - KERNEL4x1_I1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -2 - ble LDGEMM_L4x1_LOOP_END - - MY_ALIGN - -LDGEMM_L4x1_LOOP: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -1 - bgt LDGEMM_L4x1_LOOP - -LDGEMM_L4x1_LOOP_END: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_E2 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB4: - - KERNEL4x1_SUBI1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x1_SAVE - b LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x1_SAVE - -LDGEMM_L4x1_SUB2: - - KERNEL4x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SAVE: - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 -#endif -LDGEMM_L4x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - addic. J, J, -1 - bgt LDGEMM_L4_BEGIN - - andi. T2, N, 3 - ble .L999 - -LDGEMM_L4_END: - - b LDGEMM_L2_BEGIN - -.L999_H1: - - b .L999 - -LDGEMM_L2_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 2 - ble LDGEMM_L2_END - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 4 - ble LDGEMM_L2x16_END - -LDGEMM_L2x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x16_SUB4 - -LDGEMM_L2x16_LOOP_START: - - #dcbt AO, PRE - LOAD2x16_1 - #dcbt AO, PRE - KERNEL2x16_I1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -2 - ble LDGEMM_L2x16_LOOP_END - - MY_ALIGN - -LDGEMM_L2x16_LOOP: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -1 - bgt LDGEMM_L2x16_LOOP - -LDGEMM_L2x16_LOOP_END: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - KERNEL2x16_E2 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB4: - - #dcbt AO, PRE - KERNEL2x16_SUBI1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x16_SAVE - b LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x16_SAVE - -LDGEMM_L2x16_SUB2: - - KERNEL2x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SAVE: - - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt LDGEMM_L2x16_BEGIN - -LDGEMM_L2x16_END: - -LDGEMM_L2x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L2x1_END - - andi. T1, M, 8 - ble LDGEMM_L2x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x8_SUB4 - -LDGEMM_L2x8_LOOP_START: - - #dcbt AO, PRE - LOAD2x8_1 - KERNEL2x8_I1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -2 - ble LDGEMM_L2x8_LOOP_END - - MY_ALIGN - -LDGEMM_L2x8_LOOP: - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -1 - bgt LDGEMM_L2x8_LOOP - -LDGEMM_L2x8_LOOP_END: - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_2 - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_E2 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB4: - - KERNEL2x8_SUBI1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x8_SAVE - b LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x8_SAVE - -LDGEMM_L2x8_SUB2: - - KERNEL2x8_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SAVE: - - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 -#endif -LDGEMM_L2x8_END: - -LDGEMM_L2x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x4_SUB4 - -LDGEMM_L2x4_LOOP_START: - - LOAD2x4_1 - KERNEL2x4_I1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -2 - ble LDGEMM_L2x4_LOOP_END - - MY_ALIGN - -LDGEMM_L2x4_LOOP: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -1 - bgt LDGEMM_L2x4_LOOP - -LDGEMM_L2x4_LOOP_END: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_E2 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB4: - - KERNEL2x4_SUBI1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x4_SAVE - b LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x4_SAVE - -LDGEMM_L2x4_SUB2: - - KERNEL2x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SAVE: - - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 -#endif -LDGEMM_L2x4_END: - -LDGEMM_L2x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x2_SUB4 - -LDGEMM_L2x2_LOOP_START: - - LOAD2x2_1 - KERNEL2x2_I1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -2 - ble LDGEMM_L2x2_LOOP_END - - MY_ALIGN - -LDGEMM_L2x2_LOOP: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -1 - bgt LDGEMM_L2x2_LOOP - -LDGEMM_L2x2_LOOP_END: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_E2 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB4: - - KERNEL2x2_SUBI1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x2_SAVE - b LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x2_SAVE - -LDGEMM_L2x2_SUB2: - - KERNEL2x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SAVE: - - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 -#endif -LDGEMM_L2x2_END: - -LDGEMM_L2x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x1_SUB4 - -LDGEMM_L2x1_LOOP_START: - - LOAD2x1_1 - KERNEL2x1_I1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -2 - ble LDGEMM_L2x1_LOOP_END - - MY_ALIGN - -LDGEMM_L2x1_LOOP: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -1 - bgt LDGEMM_L2x1_LOOP - -LDGEMM_L2x1_LOOP_END: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_E2 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB4: - - KERNEL2x1_SUBI1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x1_SAVE - b LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x1_SAVE - -LDGEMM_L2x1_SUB2: - - KERNEL2x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SAVE: - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 -#endif -LDGEMM_L2x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LDGEMM_L2_END: -LDGEMM_L1_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 1 - ble LDGEMM_L1_END - mr CO, C - mr AO, A - srawi. I, M, 4 - ble LDGEMM_L1x16_END - -LDGEMM_L1x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x16_SUB4 - -LDGEMM_L1x16_LOOP_START: - - #dcbt AO, PRE - LOAD1x16_1 - #dcbt AO, PRE - KERNEL1x16_I1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -2 - ble LDGEMM_L1x16_LOOP_END - - MY_ALIGN - -LDGEMM_L1x16_LOOP: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -1 - bgt LDGEMM_L1x16_LOOP - -LDGEMM_L1x16_LOOP_END: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - KERNEL1x16_E2 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB4: - - #dcbt AO, PRE - KERNEL1x16_SUBI1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x16_SAVE - b LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x16_SAVE - -LDGEMM_L1x16_SUB2: - - KERNEL1x16_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SAVE: - - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt LDGEMM_L1x16_BEGIN - -LDGEMM_L1x16_END: - -LDGEMM_L1x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L1x1_END - - andi. T1, M, 8 - ble LDGEMM_L1x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x8_SUB4 - -LDGEMM_L1x8_LOOP_START: - - #dcbt AO, PRE - LOAD1x8_1 - KERNEL1x8_I1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -2 - ble LDGEMM_L1x8_LOOP_END - - MY_ALIGN - -LDGEMM_L1x8_LOOP: - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -1 - bgt LDGEMM_L1x8_LOOP - -LDGEMM_L1x8_LOOP_END: - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_2 - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_E2 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB4: - - KERNEL1x8_SUBI1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x8_SAVE - b LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x8_SAVE - -LDGEMM_L1x8_SUB2: - - KERNEL1x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SAVE: - - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 -#endif -LDGEMM_L1x8_END: - -LDGEMM_L1x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x4_SUB4 - -LDGEMM_L1x4_LOOP_START: - - LOAD1x4_1 - KERNEL1x4_I1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -2 - ble LDGEMM_L1x4_LOOP_END - - MY_ALIGN - -LDGEMM_L1x4_LOOP: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -1 - bgt LDGEMM_L1x4_LOOP - -LDGEMM_L1x4_LOOP_END: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_E2 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB4: - - KERNEL1x4_SUBI1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x4_SAVE - b LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x4_SAVE - -LDGEMM_L1x4_SUB2: - - KERNEL1x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SAVE: - - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 -#endif -LDGEMM_L1x4_END: - -LDGEMM_L1x2_BEGIN: - - - andi. 
T1, M, 2 - ble LDGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x2_SUB4 - -LDGEMM_L1x2_LOOP_START: - - LOAD1x2_1 - KERNEL1x2_I1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -2 - ble LDGEMM_L1x2_LOOP_END - - MY_ALIGN - -LDGEMM_L1x2_LOOP: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -1 - bgt LDGEMM_L1x2_LOOP - -LDGEMM_L1x2_LOOP_END: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_E2 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB4: - - KERNEL1x2_SUBI1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x2_SAVE - b LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x2_SAVE - -LDGEMM_L1x2_SUB2: - - KERNEL1x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SAVE: - - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 -#endif -LDGEMM_L1x2_END: - -LDGEMM_L1x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x1_SUB4 - -LDGEMM_L1x1_LOOP_START: - - LOAD1x1_1 - KERNEL1x1_I1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -2 - ble LDGEMM_L1x1_LOOP_END - - MY_ALIGN - -LDGEMM_L1x1_LOOP: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -1 - bgt LDGEMM_L1x1_LOOP - -LDGEMM_L1x1_LOOP_END: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_E2 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB4: - - KERNEL1x1_SUBI1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x1_SAVE - b LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x1_SAVE - -LDGEMM_L1x1_SUB2: - - KERNEL1x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SAVE: - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 -#endif -LDGEMM_L1x1_END: -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif -LDGEMM_L1_END: +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S index c4b8270b8..4eddab24f 100644 --- a/kernel/power/dgemm_macros_power9.S +++ b/kernel/power/dgemm_macros_power9.S @@ -1,3623 +1,3623 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - -/********************************************************************* -* Macros for N=4, M=16 * -*********************************************************************/ -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - xxlxor vs36,vs36,vs36 - xxlxor vs37,vs37,vs37 - xxlxor vs38,vs38,vs38 - xxlxor vs39,vs39,vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif -.endm - - -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -.macro KERNEL4x16_L1_L2 Index,IsLast - 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete - -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 -.else - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 -.endif - lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) - lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - -.else - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 -.endif - lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - - - - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - - - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - 
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) -.endif - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) - lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 -.if \Complete==0 - lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) -.endif - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) -.endif - xvmaddadp vs52, vs12, vs30 - xvmaddadp vs53, vs13, vs30 - xvmaddadp vs54, vs14, vs30 - xvmaddadp vs55, vs15, vs30 -.if \Complete==0 - lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) -.endif - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 - - - xvmaddadp vs60, vs12, vs31 - - xvmaddadp vs61, vs13, vs31 - xvmaddadp vs62, vs14, vs31 - - xvmaddadp vs63, vs15, vs31 - .if \IsLast==1 - .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) - .else - addi \AREG, \AREG, DISP32(\Index,256) - addi \BREG, \BREG, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x16 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) - - - - addi BO, BO, 32 - addi AO, AO, 128 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, 
vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 - -.endif -.endm - -.macro SAVE4x16_REGS - add C2, CO, LDC - add C3, C2, LDC - add C4, C3, LDC -.endm - -.macro SAVE4x16 -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs24, 64(CO) - lxv vs26, 80(CO) - lxv vs28, 96(CO) - lxv vs30, 112(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C2) - lxv vs3, 16(C2) - lxv vs5, 32(C2) - lxv vs7, 48(C2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs25, 64(C2) - lxv vs27, 80(C2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 -#ifndef TRMMKERNEL - lxv vs29, 96(C2) - lxv vs31, 112(C2) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - xxpermdi vs8, vs44,vs36,1 - xxpermdi vs9 ,vs36,vs44,1 - xxpermdi vs10, vs45,vs37,1 - xxpermdi vs11 ,vs37,vs45,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - xxpermdi vs12, vs46,vs38,1 - xxpermdi vs13 ,vs38,vs46,1 - xxpermdi vs14, vs47,vs39,1 - xxpermdi vs15 ,vs39,vs47,1 - -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r - -#endif - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - stxv vs24, 64(CO) - stxv vs26, 80(CO) - stxv vs28, 96(CO) - stxv vs30, 112(CO) - - stxv vs1, 0(C2) - stxv vs3, 16(C2) - stxv vs5, 32(C2) - stxv vs7, 48(C2) - - stxv vs25, 64(C2) - stxv vs27, 80(C2) - stxv vs29, 96(C2) - stxv vs31, 112(C2) -#ifndef TRMMKERNEL - lxv vs0, 0(C3) - lxv vs2, 16(C3) - lxv vs4, 32(C3) - lxv vs6, 48(C3) -#endif - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs24, 64(C3) - lxv vs26, 80(C3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs28, 96(C3) - lxv vs30, 112(C3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C4) - lxv vs3, 16(C4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(C4) - lxv vs7, 48(C4) - - lxv vs25, 64(C4) - lxv vs27, 80(C4) - lxv vs29, 96(C4) - lxv vs31, 112(C4) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r 
- xvmuldp vs3, vs11, alpha_r - -#endif - - xxpermdi vs8, vs60,vs52,1 - xxpermdi vs9 ,vs52,vs60,1 - xxpermdi vs10, vs61,vs53,1 - xxpermdi vs11 ,vs53,vs61,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - - - xxpermdi vs12, vs62,vs54,1 - xxpermdi vs13 ,vs54,vs62,1 - xxpermdi vs14, vs63,vs55,1 - xxpermdi vs15 ,vs55,vs63,1 -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r -#endif - stxv vs0, 0(C3) - stxv vs2, 16(C3) - stxv vs4, 32(C3) - stxv vs6, 48(C3) - - stxv vs24, 64(C3) - stxv vs26, 80(C3) - stxv vs28, 96(C3) - stxv vs30, 112(C3) - - stxv vs1, 0(C4) - stxv vs3, 16(C4) - stxv vs5, 32(C4) - stxv vs7, 48(C4) - - stxv vs25, 64(C4) - stxv vs27, 80(C4) - stxv vs29, 96(C4) - stxv vs31, 112(C4) - - addi CO, CO, 128 -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - -.endif -.endm - - - -.macro KERNEL4x8_L1_L2 Index,IsLast - KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index,0+\OffsetA)(AO) - lxv vs9, DISP16(\Index,16+\OffsetA)(AO) -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - - lxv vs10, DISP16(\Index,32+\OffsetA)(AO) - lxv vs11, DISP16(\Index,48+\OffsetA)(AO) - - - -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - - lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) - lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(AO) - lxv vs1, DISP16(\Index,80+\OffsetA)(AO) -.endif - - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.if \Complete==0 - lxv vs2, DISP16(\Index,96+\OffsetA)(AO) - lxv vs3, DISP16(\Index,112+\OffsetA)(AO) -.endif - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) - lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) -.endif - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - - .if \IsLast==1 - .if \Complete==1 - addi AO, AO, DISP16(\Index,64+\OffsetA) - addi BO, BO, DISP8(\Index,32+\OffsetB) - .else - addi AO, AO, DISP16(\Index,128) - addi BO, BO, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x8 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - - - addi BO, BO, 32 - addi AO, AO, 64 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - -.endif -.endm - - - -.macro SAVE4x8 - add T2, CO, LDC - add T3, T2, LDC - add T4, T3, LDC -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T2) - lxv vs3, 16(T2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T2) - lxv vs7, 48(T2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 - - - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - 
xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - - stxv vs1, 0(T2) - stxv vs3, 16(T2) - stxv vs5, 32(T2) - stxv vs7, 48(T2) - - - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs0, 0(T3) - lxv vs2, 16(T3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs4, 32(T3) - lxv vs6, 48(T3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T4) - lxv vs3, 16(T4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T4) - lxv vs7, 48(T4) - - - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(T3) - stxv vs2, 16(T3) - stxv vs4, 32(T3) - stxv vs6, 48(T3) - - - stxv vs1, 0(T4) - stxv vs3, 16(T4) - stxv vs5, 32(T4) - stxv vs7, 48(T4) - - - - addi CO, CO, 64 -.endm - - -/********************************************************************* -* Macros for N=4, M=4 * -*********************************************************************/ - -.macro LOAD4x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, 
vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro SAVE4x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=4, M=2 * -*********************************************************************/ - -.macro LOAD4x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_E2 - - - xvmaddadp vs32, vs8, 
vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r -#else - xvmuldp vs0, vs48, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r -#else - xvmuldp vs8, vs56, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=4, M=1 * -*********************************************************************/ - -.macro LOAD4x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx 
vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs48, alpha_r -#else - xsmuldp vs0, vs48, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs56, alpha_r -#else - xsmuldp vs8, vs56, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=2, M=16 * -*********************************************************************/ - -.macro LOAD2x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL2x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_E2 - - - xvmaddadp vs32, vs8, vs28 
- xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro SAVE2x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - 
xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD2x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro SAVE2x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - 
- stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=2, M=4 * -*********************************************************************/ - -.macro LOAD2x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=2, M=2 * -*********************************************************************/ - -.macro LOAD2x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - 
xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=2, M=1 * -*********************************************************************/ - -.macro LOAD2x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=1, M=16 * -*********************************************************************/ - -.macro LOAD1x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, 
o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL1x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro SAVE1x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - 
xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD1x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro SAVE1x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=1, M=4 * -*********************************************************************/ - -.macro LOAD1x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro 
KERNEL1x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=1, M=2 * -*********************************************************************/ - -.macro LOAD1x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_E2 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=1, M=1 * -*********************************************************************/ - -.macro LOAD1x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_E2 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - 
stxsdx vs0, 0, T1 - - addi CO, CO, 8 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 
AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + 
xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + 
xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 
,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + 
+.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + 
stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp 
vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + 
lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, 
alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + 
lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
+*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + 
xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, 
vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, 
vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
+*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, 
vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* 
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index bd74d20e5..58dcdec5a 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -1,328 +1,328 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
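[Editor's note, not part of the patch] The TRMM pointer-refresh macros that close the assembly file above (REFRESH_POINTERS, REFRESH_TEMP_BK, REFRESH_AFTER_SAVE) only encode the C pseudo-code quoted in their comments; SHIFT_REG turns an element count into a byte offset (off * tile * sizeof(double), hence shift amounts of log2(tile*8)). The following is a minimal C sketch of that bookkeeping under the same LEFT/TRANSA conventions; the trmm_* names and FLOAT_T typedef are illustrative only, and pointer arithmetic is shown in elements rather than bytes.

    #include <stddef.h>

    typedef double FLOAT_T;              /* stand-in for the kernel's FLOAT     */

    static void trmm_refresh_pointers(FLOAT_T **ptrba, FLOAT_T **ptrbb,
                                      FLOAT_T *bb, ptrdiff_t off,
                                      int c_a, int c_b, int left, int transa)
    {
        if ((left && transa) || (!left && !transa)) {
            *ptrbb = bb;                 /* ptrbb = bb                          */
        } else {
            *ptrba += off * c_a;         /* ptrba += off * C_A                  */
            *ptrbb  = bb + off * c_b;    /* ptrbb  = bb + off * C_B             */
        }
    }

    static ptrdiff_t trmm_temp_bk(ptrdiff_t bk, ptrdiff_t off,
                                  int c_a, int c_b, int left, int transa)
    {
        if ((left && !transa) || (!left && transa))
            return bk - off;             /* temp = bk - off                     */
        return left ? off + c_a          /* temp = off + (values in A)          */
                    : off + c_b;         /* temp = off + (values in B)          */
    }

    static void trmm_after_save(FLOAT_T **ptrba, FLOAT_T **ptrbb,
                                ptrdiff_t bk, ptrdiff_t *off,
                                int c_a, int c_b, int left, int transa)
    {
        if ((left && transa) || (!left && !transa)) {
            ptrdiff_t temp = bk - *off - (left ? c_a : c_b);
            *ptrba += temp * c_a;        /* skip the part of A already consumed */
            *ptrbb += temp * c_b;        /* skip the part of B already consumed */
        }
        if (left)
            *off += c_a;                 /* off advances by the tile height     */
    }
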
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - -#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code - -#if !defined(USE_MASK_PERMUTATIONS) - -static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgew %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgow %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { - - BLASLONG index; - BLASLONG i=0; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned int static_index0 = {0,1,2,3}; -#else - register __vector unsigned int static_index0 = {2,0,3,1}; -#endif - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0; - register __vector unsigned int static_index2=static_index0 +temp1; - register __vector unsigned int static_index3=static_index1 +temp1; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - - register __vector float * v_ptrx=(__vector float *)x; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -#endif - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vv0,vf0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(vv0,quadruple_values); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs 
are exactly the same we will choose minimum between index - // otherwise we will assign index of the maximum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - - - - - - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = ciamax_kernel_32(n1, x, &maxf); - i = n1; - ix = n1 << 1; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (max + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code + +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i=0; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; +#endif + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + 
}else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 336766245..843370c6c 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -1,266 +1,266 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
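[Editor's note, not part of the patch] The ICAMAX driver above pairs the 32-element VSX kernel with a scalar tail and a strided fallback, both built on CABS1 (|re| + |im|) and both returning a Fortran-style 1-based index. A plain reference for the semantics the vectorized path must reproduce is sketched below; ref_icamax is an illustrative name, not a symbol from the patch.

    #include <math.h>
    #include <stddef.h>

    static ptrdiff_t ref_icamax(ptrdiff_t n, const float *x, ptrdiff_t inc_x)
    {
        if (n <= 0 || inc_x <= 0) return 0;      /* BLAS convention: 0 on bad input */
        ptrdiff_t best = 0, ix = 0;
        float bestv = fabsf(x[0]) + fabsf(x[1]); /* CABS1 of element 0              */
        for (ptrdiff_t i = 1; i < n; i++) {
            ix += 2 * inc_x;                     /* complex elements: 2 floats each */
            float v = fabsf(x[ix]) + fabsf(x[ix + 1]);
            if (v > bestv) {                     /* strict '>' keeps the first max  */
                best = i;
                bestv = v;
            }
        }
        return best + 1;                         /* Fortran-style 1-based result    */
    }
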
-*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - float first_min=CABS1(x,0); - register __vector float quadruple_values={first_min,first_min,first_min,first_min}; - - register __vector float * v_ptrx=(__vector float *)x; - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vf0,vv0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(quadruple_values,vv0); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs are exactly the same we will choose minimum between index - // otherwise we will assign index of the minimum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = ciamin_kernel_32(n1, x, &minf); - i = n1; - ix = n1 << 1; - } - - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (min + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + 
unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c index bf1af78d6..fb2dafec0 100644 --- a/kernel/power/isamax.c +++ b/kernel/power/isamax.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
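[Editor's note, not part of the patch] All four kernels in these files (icamax/icamin above, isamax/isamin below) finish with the same lane reduction their comments describe: four candidate values and indices are compared pairwise, and on an exact tie the smaller index wins so the result matches the scalar "first occurrence" rule. A C sketch of that reduction for the max case follows; reduce4_max is an illustrative name only.

    static unsigned int reduce4_max(const float v[4], const unsigned int idx[4],
                                    float *out_max)
    {
        float a1 = v[0], a2 = v[1], a3 = v[2], a4 = v[3];
        unsigned int i1 = idx[0], i2 = idx[1], i3 = idx[2], i4 = idx[3];

        /* pair (a1,i1) vs (a2,i2) */
        if (a1 == a2)        i1 = i1 < i2 ? i1 : i2;
        else if (a2 > a1)  { i1 = i2; a1 = a2; }

        /* pair (a3,i3) vs (a4,i4) */
        if (a3 == a4)        i3 = i3 < i4 ? i3 : i4;
        else if (a4 > a3)  { i3 = i4; a3 = a4; }

        /* winners of the two pairs */
        if (a1 == a3)        i1 = i1 < i3 ? i1 : i3;
        else if (a3 > a1)  { i1 = i3; a1 = a3; }

        *out_max = a1;
        return i1;
    }
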
- *****************************************************************************/ -#include "common.h" -#include -#include - - -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - register __vector float * v_ptrx=(__vector float *)x; - for(; ii2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = siamax_kernel_64(n1, x, &maxf); - - i = n1; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); - } -} +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c index 1c1f0ad78..60c843f58 100644 --- a/kernel/power/isamin.c +++ b/kernel/power/isamin.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The 
OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; - register __vector float * v_ptrx=(__vector float *)x; - register __vector float quadruple_values=vec_abs(v_ptrx[0]); - for(; ii2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = siamin_kernel_64(n1, x, &minf); - i = n1; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); - } -} 
+/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < 
minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index 7a0f3143e..5cdc83d87 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -1,272 +1,272 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs20 -#define save_permute_1 vs21 -#define save_permute_2 vs22 -#define permute_mask vs23 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define T11 r29 - -#define T12 r30 -#define T13 r31 - -#include "sgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_11, 0x1415161718191a1b -.equ save_permute_12, 0x0405060708090a0b -.equ save_permute_21, 0x101112131c1d1e1f -.equ save_permute_22, 0x000102030c0d0e0f - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - slwi LDC, LDC, 2 - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - lis T5, save_permute_22@highest - lis T6, save_permute_21@highest - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - ori T5, T5, save_permute_22@higher - ori T6, T6, save_permute_21@higher - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - rldicr T5, T5, 32, 31 - rldicr T6, T6, 32, 31 - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - oris T5, T5, save_permute_22@h - oris T6, T6, save_permute_21@h - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - ori T5, T5, save_permute_22@l - ori T6, T6, save_permute_21@l - li r0,0 - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - mtvsrdd save_permute_2,T5,T6 - -#include "sgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 + +#include "sgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index a34ed32b8..4022959e2 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,2192 +1,2192 @@ -#define MY_ALIGN .align 3 -b L8 - - MY_ALIGN -LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_2 - MY_ALIGN - -LSGEMM_L8x16_LOOP: - KERNEL8x16_L2 128,64,0,0 -LSGEMM_L8x16_K128: - KERNEL8x16_L2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64, 3,0 - KERNEL8x16_I1_L4_2 128,64, 4,0 - KERNEL8x16_I1_L4_2 128,64, 5,0 - KERNEL8x16_I1_L4_2 128,64, 6,0 - KERNEL8x16_I1_L4_2 128,64, 7,0 - KERNEL8x16_I1_L4_2 128,64, 8,0 - KERNEL8x16_I1_L4_2 128,64, 9,0 - KERNEL8x16_I1_L4_2 128,64, 10,0 - KERNEL8x16_I1_L4_2 128,64, 11,0 - KERNEL8x16_I1_L4_2 128,64, 12,0 - KERNEL8x16_I1_L4_2 128,64, 13,0 - KERNEL8x16_I1_L4_2 128,64, 14,0 - KERNEL8x16_I1_L4_2 128,64, 15,0 - KERNEL8x16_I1_L4_2 128,64, 16,0 - KERNEL8x16_I1_L4_2 128,64, 17,0 - KERNEL8x16_I1_L4_2 128,64, 18,0 - KERNEL8x16_I1_L4_2 128,64, 19,0 - KERNEL8x16_I1_L4_2 128,64, 20,0 - KERNEL8x16_I1_L4_2 128,64, 21,0 - KERNEL8x16_I1_L4_2 128,64, 22,0 - KERNEL8x16_I1_L4_2 128,64, 23,0 - KERNEL8x16_I1_L4_2 128,64, 24,0 - KERNEL8x16_I1_L4_2 128,64, 25,0 - KERNEL8x16_I1_L4_2 128,64, 26,0 - KERNEL8x16_I1_L4_2 
128,64, 27,0 - KERNEL8x16_I1_L4_2 128,64, 28,0 - KERNEL8x16_I1_L4_2 128,64, 29,0 - KERNEL8x16_I1_L4_2 128,64, 30,0 - KERNEL8x16_I1_L4_2 128,64, 31,1 - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - END8x16_2 - blr - - MY_ALIGN -LSGEMM_L8x16_L64_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_2 128,64,7,0 - KERNEL8x16_I1_L4_2 128,64,8,0 - KERNEL8x16_I1_L4_2 128,64,9,0 - KERNEL8x16_I1_L4_2 128,64,10,0 - KERNEL8x16_I1_L4_2 128,64,11,0 - KERNEL8x16_I1_L4_2 128,64,12,0 - KERNEL8x16_I1_L4_2 128,64,13,0 - KERNEL8x16_I1_L4_2 128,64,14,0 - KERNEL8x16_I1_L4_3 128,64,15,1 - blr -LSGEMM_L8x16_L32_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_3 128,64,7,1 - blr - -LSGEMM_L8x16_L16_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_3 128,64,3,1 - blr - -L8: -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 3 - - ble LSGEMM_L8_END - -LSGEMM_L8_BEGIN: - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 3 - add C, C, T3 - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L8x16_END - - MY_ALIGN -LSGEMM_L8x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 - mr T12, T11 - addi T12,T12, -2 - srawi. L, T12, 7 /**(T11-2) % 128x */ -#else - mr T12, K - addi T12,T12, -2 - srawi. L, T12, 7 /**(K-2) % 128x */ -#endif - - ZERO8x16 - ble LSGEMM_L8x16_SUB0 - mtctr L - bl LSGEMM_L8x16_LMAIN_SUB - andi. L, T12, 127 - ble LSGEMM_L8x16_SAVE - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 255 - cmpwi T11,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T10,1 - bne CMP8x16_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD8x16 64,32 - END8x16_WITHOUT_ADD - LOAD8x16_2O AO,BO, 128, 64 - mtctr T10 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE -CMP8x16_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T11,128 -#else - cmpwi K,128 -#endif - bne LSGEMM_L8x16_SUB2 - MY_ALIGN - mtctr T10 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD8x16_2O AO,BO, 128,64 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE - MY_ALIGN -LSGEMM_L8x16_SUB2: - andi. T10,L,64 - ble LSGEMM_L8x16_SUB2_32 - bl LSGEMM_L8x16_L64_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_32: - andi. T10,L, 32 - ble LSGEMM_L8x16_SUB2_16 - bl LSGEMM_L8x16_L32_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L8x16_SUB2_8 - bl LSGEMM_L8x16_L16_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_3 128,64, 1,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_2 - KERNEL8x16_I1_L4_3 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_2 - KERNEL8x16_E2 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_1: - andi. 
T10,L, 1 - ble LSGEMM_L8x16_SAVE - KERNEL8x16 0 - - - MY_ALIGN -LSGEMM_L8x16_SAVE: - SAVE8x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L8x16_BEGIN - MY_ALIGN -LSGEMM_L8x16_END: -LSGEMM_L8x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 8 - ble LSGEMM_L8x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x8 - ble LSGEMM_L8x8_SUB0 - - MY_ALIGN -LSGEMM_L8x8_LOOP_START: - - LOAD8x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x8_LOOP: - - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_2 32,32, 1,0 - KERNEL8x8_I1_L4_2 32,32, 2,0 - KERNEL8x8_I1_L4_2 32,32, 3,1 - - bdnz LSGEMM_L8x8_LOOP - - MY_ALIGN -LSGEMM_L8x8_LOOP_END: - - END8x8 0, AO, BO, 32, 32 - - b LSGEMM_L8x8_SUB1 - MY_ALIGN -LSGEMM_L8x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x8_SUB2 - MY_ALIGN -LSGEMM_L8x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x8_SAVE - MY_ALIGN -LSGEMM_L8x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x8_SUB2_LOOP: - LOAD8x8_0 - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_3 32,32, 1,1 - bdnz LSGEMM_L8x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x8_SUB2_2 - LOAD8x8_0 - KERNEL8x8_I1_L4_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x8_SUB2_1 - LOAD8x8_0 - KERNEL8x8_I1_L2_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x8_SAVE - KERNEL8x8 0 - - - MY_ALIGN -LSGEMM_L8x8_SAVE: - SAVE8x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 -#endif - MY_ALIGN -LSGEMM_L8x8_END: -LSGEMM_L8x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 4 - ble LSGEMM_L8x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x4 - ble LSGEMM_L8x4_SUB0 - - MY_ALIGN -LSGEMM_L8x4_LOOP_START: - - LOAD8x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x4_LOOP: - - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_2 16,32, 1,0 - KERNEL8x4_I1_L4_2 16,32, 2,0 - KERNEL8x4_I1_L4_2 16,32, 3,1 - - bdnz LSGEMM_L8x4_LOOP - - MY_ALIGN -LSGEMM_L8x4_LOOP_END: - - END8x4 0, AO, BO, 16, 32 - - b LSGEMM_L8x4_SUB1 - MY_ALIGN -LSGEMM_L8x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x4_SUB2 - MY_ALIGN -LSGEMM_L8x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x4_SAVE - MY_ALIGN -LSGEMM_L8x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x4_SUB2_LOOP: - LOAD8x4_0 - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_3 16,32, 1,1 - bdnz LSGEMM_L8x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x4_SUB2_2 - LOAD8x4_0 - KERNEL8x4_I1_L4_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L8x4_SUB2_1 - LOAD8x4_0 - KERNEL8x4_I1_L2_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x4_SAVE - KERNEL8x4 0 - - - MY_ALIGN -LSGEMM_L8x4_SAVE: - SAVE8x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 -#endif - MY_ALIGN -LSGEMM_L8x4_END: -LSGEMM_L8x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L8x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x2 - ble LSGEMM_L8x2_SUB0 - - MY_ALIGN -LSGEMM_L8x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x2_LOOP: - - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,0 - KERNEL8x2_2 0,0, 2,0 - KERNEL8x2_2 0,0, 3,1 - - bdnz LSGEMM_L8x2_LOOP - - MY_ALIGN -LSGEMM_L8x2_LOOP_END: - -LSGEMM_L8x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x2_SAVE - MY_ALIGN -LSGEMM_L8x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x2_SUB2_2 - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x2_SUB2_1 - KERNEL8x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x2_SAVE - KERNEL8x2 - - MY_ALIGN -LSGEMM_L8x2_SAVE: - SAVE8x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 -#endif - MY_ALIGN -LSGEMM_L8x2_END: -LSGEMM_L8x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L8x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x1 - ble LSGEMM_L8x1_SUB0 - - MY_ALIGN -LSGEMM_L8x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x1_LOOP: - - KERNEL8x1_4 0,0, 0,0 - KERNEL8x1_4 0,0, 1,1 - - bdnz LSGEMM_L8x1_LOOP - - MY_ALIGN -LSGEMM_L8x1_LOOP_END: - -LSGEMM_L8x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x1_SAVE - MY_ALIGN -LSGEMM_L8x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x1_SUB2_2 - KERNEL8x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x1_SUB2_1 - KERNEL8x1_2 - MY_ALIGN -LSGEMM_L8x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x1_SAVE - KERNEL8x1 - - MY_ALIGN -LSGEMM_L8x1_SAVE: - SAVE8x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 -#endif - MY_ALIGN -LSGEMM_L8x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 8 -#endif - addic. J, J, -1 - bgt LSGEMM_L8_BEGIN - - -LSGEMM_L8_END: - -/* b LSGEMM_L4_BEGIN*/ - andi. T1, N, 4 - ble LSGEMM_L4_END -LSGEMM_L4_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L4x16_END - - MY_ALIGN -LSGEMM_L4x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 6 /**(K-1) % 64x */ -#endif - - ZERO4x16 - ble LSGEMM_L4x16_SUB0 - - MY_ALIGN -LSGEMM_L4x16_LOOP_START: - - LOAD4x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=16 - addi AO,AO,2112 - addi BO,BO,16 - - mtctr L - - MY_ALIGN - -LSGEMM_L4x16_LOOP: - - KERNEL4x16_I1_L4_2 -2048,0, 0,0 - KERNEL4x16_I1_L4_2 -2048,0, 1,0 - KERNEL4x16_I1_L4_2 -2048,0, 2,0 - KERNEL4x16_I1_L4_2 -2048,0, 3,0 - KERNEL4x16_I1_L4_2 -2048,0, 4,0 - KERNEL4x16_I1_L4_2 -2048,0, 5,0 - KERNEL4x16_I1_L4_2 -2048,0, 6,0 - KERNEL4x16_I1_L4_2 -2048,0, 7,0 - KERNEL4x16_I1_L4_2 -2048,0, 8,0 - KERNEL4x16_I1_L4_2 -2048,0, 9,0 - KERNEL4x16_I1_L4_2 -2048,0, 10,0 - KERNEL4x16_I1_L4_2 -2048,0, 11,0 - KERNEL4x16_I1_L4_2 -2048,0, 12,0 - KERNEL4x16_I1_L4_2 -2048,0, 13,0 - KERNEL4x16_I1_L4_2 -2048,0, 14,0 - KERNEL4x16_I1_L4_2 -2048,0, 15,1 - - bdnz LSGEMM_L4x16_LOOP - - MY_ALIGN -LSGEMM_L4x16_LOOP_END: - - END4x16 0, AO, BO, -2048, 0 - - b LSGEMM_L4x16_SUB1 - MY_ALIGN -LSGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 127 -#else - andi. L, K, 127 -#endif - b LSGEMM_L4x16_SUB2 - MY_ALIGN -LSGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L4x16_SAVE - MY_ALIGN -LSGEMM_L4x16_SUB2: - - srawi. T10,L, 5 - ble LSGEMM_L4x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L4x16_SUB2_LOOP: - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_2 64,16, 3,0 - KERNEL4x16_I1_L4_2 64,16, 4,0 - KERNEL4x16_I1_L4_2 64,16, 5,0 - KERNEL4x16_I1_L4_2 64,16, 6,0 - KERNEL4x16_I1_L4_3 64,16, 7,1 - bdnz LSGEMM_L4x16_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_3 64,16, 3,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_3 64,16, 1,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L4_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L4x16_SUB2 - - MY_ALIGN -LSGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L4x16_BEGIN - MY_ALIGN -LSGEMM_L4x16_END: -LSGEMM_L4x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 8 - ble LSGEMM_L4x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x8 - ble LSGEMM_L4x8_SUB0 - - MY_ALIGN -LSGEMM_L4x8_LOOP_START: - - LOAD4x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_2 32,16, 1,0 - KERNEL4x8_I1_L4_2 32,16, 2,0 - KERNEL4x8_I1_L4_2 32,16, 3,1 - - bdnz LSGEMM_L4x8_LOOP - - MY_ALIGN -LSGEMM_L4x8_LOOP_END: - - END4x8 0, AO, BO, 32, 16 - - b LSGEMM_L4x8_SUB1 - MY_ALIGN -LSGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. 
L, K, 31 -#endif - b LSGEMM_L4x8_SUB2 - MY_ALIGN -LSGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x8_SAVE - MY_ALIGN -LSGEMM_L4x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x8_SUB2_LOOP: - LOAD4x8_0 - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_3 32,16, 1,1 - bdnz LSGEMM_L4x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L4_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x8_SAVE - KERNEL4x8 0 - - - MY_ALIGN -LSGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 -#endif - MY_ALIGN -LSGEMM_L4x8_END: -LSGEMM_L4x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 4 - ble LSGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x4 - ble LSGEMM_L4x4_SUB0 - - MY_ALIGN -LSGEMM_L4x4_LOOP_START: - - LOAD4x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x4_LOOP: - - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_2 16,16, 1,0 - KERNEL4x4_I1_L4_2 16,16, 2,0 - KERNEL4x4_I1_L4_2 16,16, 3,1 - - bdnz LSGEMM_L4x4_LOOP - - MY_ALIGN -LSGEMM_L4x4_LOOP_END: - - END4x4 0, AO, BO, 16, 16 - - b LSGEMM_L4x4_SUB1 - MY_ALIGN -LSGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L4x4_SUB2 - MY_ALIGN -LSGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x4_SAVE - MY_ALIGN -LSGEMM_L4x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x4_SUB2_LOOP: - LOAD4x4_0 - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_3 16,16, 1,1 - bdnz LSGEMM_L4x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x4_SUB2_2 - LOAD4x4_0 - KERNEL4x4_I1_L4_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x4_SUB2_1 - LOAD4x4_0 - KERNEL4x4_I1_L2_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x4_SAVE - KERNEL4x4 0 - - - MY_ALIGN -LSGEMM_L4x4_SAVE: - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 -#endif - MY_ALIGN -LSGEMM_L4x4_END: -LSGEMM_L4x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L4x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x2 - ble LSGEMM_L4x2_SUB0 - - MY_ALIGN -LSGEMM_L4x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x2_LOOP: - - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,0 - KERNEL4x2_2 0,0, 2,0 - KERNEL4x2_2 0,0, 3,1 - - bdnz LSGEMM_L4x2_LOOP - - MY_ALIGN -LSGEMM_L4x2_LOOP_END: - -LSGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x2_SAVE - MY_ALIGN -LSGEMM_L4x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x2_SUB2_2 - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L4x2_SUB2_1 - KERNEL4x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -LSGEMM_L4x2_SAVE: - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 -#endif - MY_ALIGN -LSGEMM_L4x2_END: -LSGEMM_L4x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x1 - ble LSGEMM_L4x1_SUB0 - - MY_ALIGN -LSGEMM_L4x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x1_LOOP: - - KERNEL4x1_4 0,0, 0,0 - KERNEL4x1_4 0,0, 1,1 - - bdnz LSGEMM_L4x1_LOOP - - MY_ALIGN -LSGEMM_L4x1_LOOP_END: - -LSGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x1_SAVE - MY_ALIGN -LSGEMM_L4x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x1_SUB2_2 - KERNEL4x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x1_SUB2_1 - KERNEL4x1_2 - MY_ALIGN -LSGEMM_L4x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -LSGEMM_L4x1_SAVE: - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 -#endif - MY_ALIGN -LSGEMM_L4x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - - andi. T2, N, 3 - ble .L999 - -LSGEMM_L4_END: - andi. T1, N, 2 - ble LSGEMM_L2_END -LSGEMM_L2_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 1 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L2x16_END - - MY_ALIGN -LSGEMM_L2x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x16 - ble LSGEMM_L2x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x16_LOOP: - - KERNEL2x16_4 -2048,0, 0,0 - KERNEL2x16_4 -2048,0, 1,0 - KERNEL2x16_4 -2048,0, 2,0 - KERNEL2x16_4 -2048,0, 3,0 - KERNEL2x16_4 -2048,0, 4,0 - KERNEL2x16_4 -2048,0, 5,0 - KERNEL2x16_4 -2048,0, 6,0 - KERNEL2x16_4 -2048,0, 7,0 - KERNEL2x16_4 -2048,0, 8,0 - KERNEL2x16_4 -2048,0, 9,0 - KERNEL2x16_4 -2048,0, 10,0 - KERNEL2x16_4 -2048,0, 11,0 - KERNEL2x16_4 -2048,0, 12,0 - KERNEL2x16_4 -2048,0, 13,0 - KERNEL2x16_4 -2048,0, 14,0 - KERNEL2x16_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x16_SAVE - MY_ALIGN -LSGEMM_L2x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x16_SUB2_16 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,0 - KERNEL2x16_4 0,0, 4,0 - KERNEL2x16_4 0,0, 5,0 - KERNEL2x16_4 0,0, 6,0 - KERNEL2x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x16_SUB2_8 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x16_SUB2_4 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x16_SUB2_2 - KERNEL2x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_2: - andi. 
T10,L, 2 - ble LSGEMM_L2x16_SUB2_1 - KERNEL2x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x16_SAVE - KERNEL2x16 - - MY_ALIGN -LSGEMM_L2x16_SAVE: - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L2x16_BEGIN - MY_ALIGN -LSGEMM_L2x16_END: - andi. I, M, 8 - ble LSGEMM_L2x8_END - - MY_ALIGN -LSGEMM_L2x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x8 - ble LSGEMM_L2x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x8_LOOP: - - KERNEL2x8_4 -2048,0, 0,0 - KERNEL2x8_4 -2048,0, 1,0 - KERNEL2x8_4 -2048,0, 2,0 - KERNEL2x8_4 -2048,0, 3,0 - KERNEL2x8_4 -2048,0, 4,0 - KERNEL2x8_4 -2048,0, 5,0 - KERNEL2x8_4 -2048,0, 6,0 - KERNEL2x8_4 -2048,0, 7,0 - KERNEL2x8_4 -2048,0, 8,0 - KERNEL2x8_4 -2048,0, 9,0 - KERNEL2x8_4 -2048,0, 10,0 - KERNEL2x8_4 -2048,0, 11,0 - KERNEL2x8_4 -2048,0, 12,0 - KERNEL2x8_4 -2048,0, 13,0 - KERNEL2x8_4 -2048,0, 14,0 - KERNEL2x8_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x8_SAVE - MY_ALIGN -LSGEMM_L2x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x8_SUB2_16 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,0 - KERNEL2x8_4 0,0, 4,0 - KERNEL2x8_4 0,0, 5,0 - KERNEL2x8_4 0,0, 6,0 - KERNEL2x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x8_SUB2_8 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x8_SUB2_4 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x8_SUB2_2 - KERNEL2x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x8_SUB2_1 - KERNEL2x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -LSGEMM_L2x8_SAVE: - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 -#endif - MY_ALIGN -LSGEMM_L2x8_END: - andi. I, M, 4 - ble LSGEMM_L2x4_END - - MY_ALIGN -LSGEMM_L2x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x4 - ble LSGEMM_L2x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x4_LOOP: - - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,0 - KERNEL2x4_4 0,0, 8,0 - KERNEL2x4_4 0,0, 9,0 - KERNEL2x4_4 0,0, 10,0 - KERNEL2x4_4 0,0, 11,0 - KERNEL2x4_4 0,0, 12,0 - KERNEL2x4_4 0,0, 13,0 - KERNEL2x4_4 0,0, 14,0 - KERNEL2x4_4 0,0, 15,1 - - bdnz LSGEMM_L2x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x4_SAVE - MY_ALIGN -LSGEMM_L2x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x4_SUB2_16 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x4_SUB2_8 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x4_SUB2_4 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x4_SUB2_2 - KERNEL2x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x4_SUB2_1 - KERNEL2x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x4_SAVE - KERNEL2x4 - - MY_ALIGN -LSGEMM_L2x4_SAVE: - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 -#endif - MY_ALIGN -LSGEMM_L2x4_END: - andi. I, M, 2 - ble LSGEMM_L2x2_END - - MY_ALIGN -LSGEMM_L2x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x2 - ble LSGEMM_L2x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x2_LOOP: - - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,0 - KERNEL2x2_4 0,0, 8,0 - KERNEL2x2_4 0,0, 9,0 - KERNEL2x2_4 0,0, 10,0 - KERNEL2x2_4 0,0, 11,0 - KERNEL2x2_4 0,0, 12,0 - KERNEL2x2_4 0,0, 13,0 - KERNEL2x2_4 0,0, 14,0 - KERNEL2x2_4 0,0, 15,1 - - bdnz LSGEMM_L2x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x2_SAVE - MY_ALIGN -LSGEMM_L2x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x2_SUB2_16 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x2_SUB2_8 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x2_SUB2_4 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x2_SUB2_2 - KERNEL2x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x2_SUB2_1 - KERNEL2x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -LSGEMM_L2x2_SAVE: - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 -#endif - MY_ALIGN -LSGEMM_L2x2_END: - andi. I, M, 1 - ble LSGEMM_L2x1_END - - MY_ALIGN -LSGEMM_L2x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. 
L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x1 - ble LSGEMM_L2x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x1_LOOP: - - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,0 - KERNEL2x1_4 0,0, 8,0 - KERNEL2x1_4 0,0, 9,0 - KERNEL2x1_4 0,0, 10,0 - KERNEL2x1_4 0,0, 11,0 - KERNEL2x1_4 0,0, 12,0 - KERNEL2x1_4 0,0, 13,0 - KERNEL2x1_4 0,0, 14,0 - KERNEL2x1_4 0,0, 15,1 - - bdnz LSGEMM_L2x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x1_SAVE - MY_ALIGN -LSGEMM_L2x1_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x1_SUB2_16 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x1_SUB2_8 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x1_SUB2_4 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x1_SUB2_2 - KERNEL2x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x1_SUB2_1 - KERNEL2x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -LSGEMM_L2x1_SAVE: - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 -#endif - MY_ALIGN -LSGEMM_L2x1_END: - slwi T1, K, 3 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LSGEMM_L2_END: - andi. T1, N, 1 - ble LSGEMM_END -LSGEMM_1_BEGIN: - - - mr AO, A - mr CO, C - add C, C, LDC - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_1x16_END - - MY_ALIGN -LSGEMM_1x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x16 - ble LSGEMM_1x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x16_LOOP: - - KERNEL1x16_4 -2048,0, 0,0 - KERNEL1x16_4 -2048,0, 1,0 - KERNEL1x16_4 -2048,0, 2,0 - KERNEL1x16_4 -2048,0, 3,0 - KERNEL1x16_4 -2048,0, 4,0 - KERNEL1x16_4 -2048,0, 5,0 - KERNEL1x16_4 -2048,0, 6,0 - KERNEL1x16_4 -2048,0, 7,0 - KERNEL1x16_4 -2048,0, 8,0 - KERNEL1x16_4 -2048,0, 9,0 - KERNEL1x16_4 -2048,0, 10,0 - KERNEL1x16_4 -2048,0, 11,0 - KERNEL1x16_4 -2048,0, 12,0 - KERNEL1x16_4 -2048,0, 13,0 - KERNEL1x16_4 -2048,0, 14,0 - KERNEL1x16_4 -2048,0, 15,1 - - bdnz LSGEMM_1x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x16_SAVE - MY_ALIGN -LSGEMM_1x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x16_SUB2_16 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,0 - KERNEL1x16_4 0,0, 4,0 - KERNEL1x16_4 0,0, 5,0 - KERNEL1x16_4 0,0, 6,0 - KERNEL1x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x16_SUB2_8 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x16_SUB2_8: - andi. 
T10,L, 8 - ble LSGEMM_1x16_SUB2_4 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x16_SUB2_2 - KERNEL1x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x16_SUB2_1 - KERNEL1x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x16_SAVE - KERNEL1x16 - - MY_ALIGN -LSGEMM_1x16_SAVE: - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt+ LSGEMM_1x16_BEGIN - MY_ALIGN -LSGEMM_1x16_END: - andi. I, M, 8 - ble LSGEMM_1x8_END - - MY_ALIGN -LSGEMM_1x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x8 - ble LSGEMM_1x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x8_LOOP: - - KERNEL1x8_4 -2048,0, 0,0 - KERNEL1x8_4 -2048,0, 1,0 - KERNEL1x8_4 -2048,0, 2,0 - KERNEL1x8_4 -2048,0, 3,0 - KERNEL1x8_4 -2048,0, 4,0 - KERNEL1x8_4 -2048,0, 5,0 - KERNEL1x8_4 -2048,0, 6,0 - KERNEL1x8_4 -2048,0, 7,0 - KERNEL1x8_4 -2048,0, 8,0 - KERNEL1x8_4 -2048,0, 9,0 - KERNEL1x8_4 -2048,0, 10,0 - KERNEL1x8_4 -2048,0, 11,0 - KERNEL1x8_4 -2048,0, 12,0 - KERNEL1x8_4 -2048,0, 13,0 - KERNEL1x8_4 -2048,0, 14,0 - KERNEL1x8_4 -2048,0, 15,1 - - bdnz LSGEMM_1x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x8_SAVE - MY_ALIGN -LSGEMM_1x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x8_SUB2_16 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,0 - KERNEL1x8_4 0,0, 4,0 - KERNEL1x8_4 0,0, 5,0 - KERNEL1x8_4 0,0, 6,0 - KERNEL1x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x8_SUB2_8 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x8_SUB2_4 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x8_SUB2_2 - KERNEL1x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x8_SUB2_1 - KERNEL1x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x8_SAVE - KERNEL1x8 - - MY_ALIGN -LSGEMM_1x8_SAVE: - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 -#endif - MY_ALIGN -LSGEMM_1x8_END: - andi. I, M, 4 - ble LSGEMM_1x4_END - - MY_ALIGN -LSGEMM_1x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x4 - ble LSGEMM_1x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x4_LOOP: - - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,0 - KERNEL1x4_4 0,0, 8,0 - KERNEL1x4_4 0,0, 9,0 - KERNEL1x4_4 0,0, 10,0 - KERNEL1x4_4 0,0, 11,0 - KERNEL1x4_4 0,0, 12,0 - KERNEL1x4_4 0,0, 13,0 - KERNEL1x4_4 0,0, 14,0 - KERNEL1x4_4 0,0, 15,1 - - bdnz LSGEMM_1x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x4_SAVE - MY_ALIGN -LSGEMM_1x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x4_SUB2_16 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x4_SUB2_8 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x4_SUB2_4 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x4_SUB2_2 - KERNEL1x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x4_SUB2_1 - KERNEL1x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x4_SAVE - KERNEL1x4 - - MY_ALIGN -LSGEMM_1x4_SAVE: - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 -#endif - MY_ALIGN -LSGEMM_1x4_END: - andi. I, M, 2 - ble LSGEMM_1x2_END - - MY_ALIGN -LSGEMM_1x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x2 - ble LSGEMM_1x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x2_LOOP: - - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,0 - KERNEL1x2_4 0,0, 8,0 - KERNEL1x2_4 0,0, 9,0 - KERNEL1x2_4 0,0, 10,0 - KERNEL1x2_4 0,0, 11,0 - KERNEL1x2_4 0,0, 12,0 - KERNEL1x2_4 0,0, 13,0 - KERNEL1x2_4 0,0, 14,0 - KERNEL1x2_4 0,0, 15,1 - - bdnz LSGEMM_1x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x2_SAVE - MY_ALIGN -LSGEMM_1x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x2_SUB2_16 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x2_SUB2_8 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x2_SUB2_4 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x2_SUB2_2 - KERNEL1x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x2_SUB2_1 - KERNEL1x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x2_SAVE - KERNEL1x2 - - MY_ALIGN -LSGEMM_1x2_SAVE: - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 -#endif - MY_ALIGN -LSGEMM_1x2_END: - andi. I, M, 1 - ble LSGEMM_1x1_END - - MY_ALIGN -LSGEMM_1x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x1 - ble LSGEMM_1x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x1_LOOP: - - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,0 - KERNEL1x1_16 0,0, 2,0 - KERNEL1x1_16 0,0, 3,1 - - bdnz LSGEMM_1x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x1_SAVE - MY_ALIGN -LSGEMM_1x1_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x1_SUB2_16 - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,1 - MY_ALIGN -LSGEMM_1x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x1_SUB2_8 - KERNEL1x1_16 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x1_SUB2_4 - KERNEL1x1_8 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x1_SUB2_2 - KERNEL1x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x1_SUB2_1 - KERNEL1x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x1_SAVE - KERNEL1x1 - - MY_ALIGN -LSGEMM_1x1_SAVE: - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 -#endif - MY_ALIGN -LSGEMM_1x1_END: - slwi T1, K, 2 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif +#define MY_ALIGN .align 3 +b L8 + + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_2 + MY_ALIGN + +LSGEMM_L8x16_LOOP: + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16_2 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 + blr + +L8: +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ +#else + mr T12, K + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + mtctr L + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 255 + cmpwi T11,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else + cmpwi K,128 +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 + ble LSGEMM_L8x16_SUB2_16 + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + bl LSGEMM_L8x16_L16_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 + + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. 
T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. 
L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. 
T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 2c9e537c7..3750d338d 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -1,5575 +1,5575 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define unit_size 4 -#define DISP64(ind,disp) (ind*unit_size*64+disp) -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -/********************************************************************************************** -* Macros for N=8 and M=16 -**********************************************************************************************/ - - - -.macro KERNEL8x16_L1_L4 Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - -.macro LOAD8x16 OffsetA,OffsetB - - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endm - -.macro END8x16_NORMAL - END8x16 0, AO, BO, 64,32 -.endm - -.macro END8x16_WITHOUT_ADD - END8x16 0, AO,BO,0,0 -.endm - -.macro END8x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, 
vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.endif -.endm - -.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - -KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete - -.endm - -.macro KERNEL8x16 First - - LOAD8x16 0,0 - END8x16 \First, AO, BO, 64,32 -.endm - -.macro LOAD8x16_2 - LOAD8x16_2O AO,BO, 0,0 -.endm - -.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB - lxv vs8, (\OffsetB)(\BREG) - lxv vs12, (16+\OffsetB)(\BREG) - lxv vs24, (32+\OffsetB)(\BREG) - lxv vs28, (32+16+\OffsetB)(\BREG) - lxv vs4, (0+\OffsetA)(\AREG) - lxv vs5, (16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(\AREG) - lxv vs7, (48+\OffsetA)(\AREG) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(\AREG) - lxv vs1, (64+16+\OffsetA)(\AREG) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(\AREG) - lxv vs3, (64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - -.macro END8x16_2 - /*for load2 offset will be 128 and 64*/ - KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.if \Complete==0 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - -.if \Complete==0 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,\OffsetB) - addi \AREG, \AREG, DISP32(\Index,\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - - -.macro SAVE8x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - - - - /* permute to restore butterfly rank 1 updateto normal promoted one */ - /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ - /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ - /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ - /* permute 16 vs24 MEM(32+CO) vs25 
MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) -#endif - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 -#ifndef TRMMKERNEL - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) -#endif - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - - - -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 -#ifndef TRMMKERNEL - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 -#ifndef TRMMKERNEL - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - - - - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - - stxv vs32, 0(CO) - stxv vs33, 16(CO) -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - stxv vs34, 32(CO) - stxv vs35, 48(CO) -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T1) - stxv vs37, 16(T1) -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - stxv vs38, 32(T1) - stxv vs39, 48(T1) - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - - stxv vs40, 0(T2) - stxv vs41, 16(T2) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T2) - stxv vs43, 48(T2) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - stxv vs44, 0(T3) - stxv vs45, 16(T3) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - /*****the same with the second 8X8 ****/ - #ifndef TRMMKERNEL - lxv vs32, 0(T4) - lxv vs33, 16(T4) -#endif - xxmrglw vs8, vs48, vs60 - xxmrglw 
vs10, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs34, 32(T4) - lxv vs35, 48(T4) -#endif - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs36, 0(T5) - lxv vs37, 16(T5) -#endif - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 -#ifndef TRMMKERNEL - lxv vs38,32(T5) - lxv vs39, 48(T5) -#endif - - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) -#endif - xxmrglw vs16, vs50, vs62 - xxmrglw vs18, vs54, vs58 -#ifndef TRMMKERNEL - lxv vs42, 32(T6) - lxv vs43, 48(T6) -#endif - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - xxmrghw vs4, vs54, vs58 - xxmrghw vs5, vs50, vs62 -#ifndef TRMMKERNEL - lxv vs44, 0(T7) - lxv vs45, 16(T7) -#endif - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 -#ifndef TRMMKERNEL - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 - - - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - #ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - stxv vs32, 0(T4) - stxv vs33, 16(T4) - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - stxv vs34, 32(T4) - stxv vs35, 48(T4) - -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T5) - stxv vs37, 16(T5) - -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - - - stxv vs38, 32(T5) - stxv vs39, 48(T5) - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T6) - stxv vs43, 48(T6) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - - stxv vs44, 0(T7) - stxv vs45, 16(T7) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - - stxv vs46, 32(T7) - stxv vs47, 48(T7) - - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=8 and M=8 -**********************************************************************************************/ - -.macro LOAD8x8_1 - LOAD8x8 1 -.endm - -.macro LOAD8x8_0 - LOAD8x8 0 
-.endm - -.macro KERNEL8x8_L1_L4 Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END8x8_NORMAL - END8x8 0, AO, BO, 32,32 -.endm - -.macro Zero8X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - -.endm - -.macro LOAD8x8 Zero - - lxv vs24, 0(BO) - lxv vs28, 16(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endif -.endm - - -.macro END8x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.endm - -.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - 
xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - -.if \Complete==0 - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endm - -.macro KERNEL8x8 First - - LOAD8x8 0 - END8x8 \First, AO, BO, 32,32 -.endm - -.macro KERNEL8x8_L1_L2_I 
AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endif - -.endm - - -.macro SAVE8x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) - lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - lxv vs50, 0(T4) - lxv vs51, 16(T4) - lxv vs54, 0(T5) - lxv vs55, 16(T5) - lxv vs58, 0(T6) - lxv vs59, 16(T6) - lxv vs62, 0(T7) - lxv vs63, 16(T7) -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, 
vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - xxmrglw vs8, vs48, vs60 - xxmrglw vs10, vs52, vs56 - - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 - stxv vs34, 0(CO) - stxv vs35, 16(CO) - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 - stxv vs38, 0(T1) - stxv vs39, 16(T1) - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 - stxv vs42, 0(T2) - stxv vs43, 16(T2) - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - stxv vs46, 0(T3) - stxv vs47, 16(T3) - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - #ifdef TRMMKERNEL - xvmulsp vs50, vs8, alpha_r - xvmulsp vs51, vs12, alpha_r - xvmulsp vs54, vs9, alpha_r - xvmulsp vs55, vs13, alpha_r - xvmulsp vs58, vs10, alpha_r - xvmulsp vs59, vs14, alpha_r - xvmulsp vs62, vs11, alpha_r - xvmulsp vs63, vs15, alpha_r -#else - xvmaddasp vs50, vs8, alpha_r - xvmaddasp vs51, vs12, alpha_r - xvmaddasp vs54, vs9, alpha_r - xvmaddasp vs55, vs13, alpha_r - xvmaddasp vs58, vs10, alpha_r - xvmaddasp vs59, vs14, alpha_r - xvmaddasp vs62, vs11, alpha_r - xvmaddasp vs63, vs15, alpha_r -#endif - - stxv vs50, 0(T4) - stxv vs51, 16(T4) - stxv vs54, 0(T5) - stxv vs55, 16(T5) - stxv vs58, 0(T6) - stxv vs59, 16(T6) - stxv vs62, 0(T7) - stxv vs63, 16(T7) - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=4 -**********************************************************************************************/ - -.macro LOAD8x4_1 - LOAD8x4 1 -.endm - -.macro LOAD8x4_0 - LOAD8x4 0 -.endm - -.macro KERNEL8x4_L1_L4 Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, 
\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - -.endm - -.macro LOAD8x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - lxv vs25, 16(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 -.endif -.endm - -.macro END8x4_NORMAL - END8x4 0, AO, BO, 16,32 -.endm - -.macro END8x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.endif -.endm - -.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, 
DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - -.macro KERNEL8x4 First - LOAD8x4 0 - END8x4 \First, AO, BO, 16,32 -.endm - -.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - xvmulsp vs48, vs27, vs4 - xvmulsp vs49, vs27, vs5 - xvmulsp vs50, vs27, vs6 - xvmulsp vs51, vs27, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - - -.macro SAVE8x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - add T4, T2, T10 - add T5, T3, T10 -#if !defined(TRMMKERNEL) - lxv vs40, 0(T4) - lxv vs41, 0(T5) -#endif - add T6, T4, T10 - add T7, T5, T10 -#if !defined(TRMMKERNEL) - lxv vs42, 0(T6) - lxv vs43, 0(T7) -#endif - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - - xxmrglw vs0, vs51,vs48 - xxmrglw vs1, vs50,vs49 - xxmrglw vs4, vs48,vs51 - xxmrglw vs5, vs49,vs50 - - xxmrghw vs2, vs51,vs48 - xxmrghw vs3, vs50,vs49 - xxmrghw vs6, vs48,vs51 - xxmrghw vs7, vs49,vs50 - - xxmrgld vs28, vs1, vs0 - xxmrghd vs29,vs5,vs4 - - xxmrgld vs30, vs2, vs3 - xxmrghd vs31,vs6,vs7 -#if defined(TRMMKERNEL) - - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, alpha_r - xvmulsp vs39, vs27, alpha_r - xvmulsp vs40, vs28, alpha_r - xvmulsp vs41, vs29, alpha_r - xvmulsp vs42, vs30, alpha_r - xvmulsp vs43, vs31, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - xvmaddasp vs40, vs28, alpha_r - xvmaddasp vs41, vs29, alpha_r - 
xvmaddasp vs42, vs30, alpha_r - xvmaddasp vs43, vs31, alpha_r -#endif - - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - stxv vs40, 0(T4) - stxv vs41, 0(T5) - stxv vs42, 0(T6) - stxv vs43, 0(T7) - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=2 -**********************************************************************************************/ - - -.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero8x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - -.endm - -.macro KERNEL8x2 - KERNEL8x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP8(\Index,32) - -.endm - -.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs1, vs29, vs10 - xvmulsp vs2, vs28, vs11 - xvmulsp vs3, vs29, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs1, vs29, vs10 - xvmaddasp vs2, vs28, vs11 - xvmaddasp vs3, vs29, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE8x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - lxssp v8,0(T4) - lxssp v9,4(T4) - - lxssp v10,0(T5) - lxssp v11,4(T5) - - lxssp v12,0(T6) - lxssp v13,4(T6) - - lxssp v14,0(T7) - lxssp v15,4(T7) -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - xscvspdp vs9, vs3 - xxspltw vs10, vs3, 1 - xxspltw vs11, vs3, 2 - xxspltw vs12, vs3, 3 - xscvspdp vs10,vs10 - xscvspdp vs11,vs11 - xscvspdp vs12,vs12 - - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 - - - - -#if defined(TRMMKERNEL) - 
xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - xsmuldp vs40,vs12, vs4 - xsmuldp vs41,vs31, vs4 - - xsmuldp vs42,vs11, vs4 - xsmuldp vs43,vs30, vs4 - - xsmuldp vs44,vs10, vs4 - xsmuldp vs45,vs29, vs4 - - xsmuldp vs46,vs9, vs4 - xsmuldp vs47,vs28, vs4 -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - xsmaddadp vs40,vs12, vs4 - xsmaddadp vs41,vs31, vs4 - - xsmaddadp vs42,vs11, vs4 - xsmaddadp vs43,vs30, vs4 - - xsmaddadp vs44,vs10, vs4 - xsmaddadp vs45,vs29, vs4 - - xsmaddadp vs46,vs9, vs4 - xsmaddadp vs47,vs28, vs4 -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - stxssp v8,0(T4) - stxssp v9,4(T4) - - stxssp v10,0(T5) - stxssp v11,4(T5) - - stxssp v12,0(T6) - stxssp v13,4(T6) - - stxssp v14,0(T7) - stxssp v15,4(T7) - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=1 -**********************************************************************************************/ -.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero8x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 -.endm - -.macro KERNEL8x1 - KERNEL8x1_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_2 - KERNEL8x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL8x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) - lxv vs28, 32(\BREG) - lxv vs29, 48(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 64 -.endm - -.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) - lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) - lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) - lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) - lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs1, vs31, vs10 - xvmulsp vs0, vs32, vs11 - xvmulsp vs1, vs33, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs0, vs32, vs11 - xvmaddasp vs1, vs33, vs11 - .endif -.if \IsLast==1 - addi 
\AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP32(\Index,128) -.endif -.endm - -.macro SAVE8x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) - lxssp v8,0(T4) - lxssp v10,0(T5) - lxssp v12,0(T6) - lxssp v14,0(T7) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 - xsmuldp vs40,vs31, vs4 - xsmuldp vs42,vs30, vs4 - xsmuldp vs44,vs29, vs4 - xsmuldp vs46,vs28, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 - xsmaddadp vs40,vs31, vs4 - xsmaddadp vs42,vs30, vs4 - xsmaddadp vs44,vs29, vs4 - xsmaddadp vs46,vs28, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - stxssp v8,0(T4) - stxssp v10,0(T5) - stxssp v12,0(T6) - stxssp v14,0(T7) - addi CO,CO,4 -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=16 -**********************************************************************************************/ - -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm - -.macro KERNEL4x16_L1_L4 Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor 
vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - -.endif -.endm - -.macro END4x16_NORMAL - END4x16 0, AO, BO, 64,16 -.endm - -.macro END4x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - -.endif -.endm - -.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, 
vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endm - -.macro KERNEL4x16 First - - LOAD4x16 0 - END4x16 \First, AO, BO, 64,16 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endif - -.endm - - -.macro SAVE4x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - 
stxv vs36, 0(T1) - stxv vs37, 16(T1) - stxv vs38, 32(T1) - stxv vs39, 48(T1) - - stxv vs40, 0(T2) - stxv vs41, 16(T2) - stxv vs42, 32(T2) - stxv vs43, 48(T2) - stxv vs44, 0(T3) - stxv vs45, 16(T3) - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=8 -**********************************************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm - -.macro KERNEL4x8_L1_L4 Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END4x8_NORMAL - END4x8 0, AO, BO, 32,16 -.endm - -.macro Zero4X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endm - -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endif -.endm - - -.macro END4x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.endm - -.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, 
vs24, vs24,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - -.endm - -.macro KERNEL4x8 First - - LOAD4x8 0 - END4x8 \First, AO, BO, 32,16 -.endm - -.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - -.endif - -.endm - - -.macro SAVE4x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) 
- lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - stxv vs34, 0(CO) - stxv vs35, 16(CO) - stxv vs38, 0(T1) - stxv vs39, 16(T1) - stxv vs42, 0(T2) - stxv vs43, 16(T2) - stxv vs46, 0(T3) - stxv vs47, 16(T3) - - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=4 -**********************************************************************************************/ - -.macro LOAD4x4_1 - LOAD4x4 1 -.endm - -.macro LOAD4x4_0 - LOAD4x4 0 -.endm - -.macro KERNEL4x4_L1_L4 Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - -.macro LOAD4x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endif -.endm - -.macro END4x4_NORMAL - END4x4 0, AO, BO, 16,16 -.endm - -.macro END4x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - 
xvmaddasp vs35, vs24, vs3 - - -.endif -.endm - -.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - -.macro KERNEL4x4 First - LOAD4x4 0 - END4x4 \First, AO, BO, 16,16 -.endm - -.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) - -.endif -.endif - - -.endm - - -.macro SAVE4x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - #if defined(TRMMKERNEL) - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, 
alpha_r - xvmulsp vs39, vs27, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - #endif - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=2 -**********************************************************************************************/ - - -.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero4x2 - xxlxor vs0, vs0, vs0 - xxlxor vs2, vs2, vs2 - -.endm - -.macro KERNEL4x2 - KERNEL4x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP4(\Index,16) - -.endm - -.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs2, vs28, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs2, vs28, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE4x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=1 -**********************************************************************************************/ -.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero4x1 - xxlxor vs0, vs0, 
vs0 -.endm - -.macro KERNEL4x1 - KERNEL4x1_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_2 - KERNEL4x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 16 -.endm - -.macro KERNEL4x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs28, 16(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs0, vs32, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs0, vs32, vs11 - .endif -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif -.endm - -.macro SAVE4x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - addi CO,CO,4 -.endm - -/****************************N=2 section*****************/ - -.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - xxlxor vs6, vs6, vs6 - xxlxor vs7, vs7, vs7 -.endm - -.macro KERNEL2x16 - KERNEL2x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - xvmulsp vs6, vs28, vs9 - xvmulsp vs7, vs29, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, 
vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs2, vs32, vs12 - xvmaddasp vs3, vs33, vs12 - - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - xvmaddasp vs6, vs32, vs13 - xvmaddasp vs7, vs33, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs2, vs36, vs14 - xvmaddasp vs3, vs37, vs14 - - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - xvmaddasp vs6, vs36, vs15 - xvmaddasp vs7, vs37, vs15 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE2x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv 
vs19, 48(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - lxv vs28, 32(T1) - lxv vs29, 48(T1) -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r - xvmulsp vs28, vs6, alpha_r - xvmulsp vs29, vs7, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r - xvmaddasp vs28, vs6, alpha_r - xvmaddasp vs29, vs7, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - stxv vs28, 32(T1) - stxv vs29, 48(T1) - - addi CO,CO,64 - -.endm - -/* M=8 N=2 */ - -.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x8 - KERNEL2x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs17, 
DISP16(\Index,48+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE2x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - - addi CO,CO,32 - -.endm - - -/*M=4*/ - - -.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - /* we will aggregate on save vs0 +vs4 vs11+vs5 */ -.macro Zero2x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x4 - KERNEL2x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs30, vs13 - xvmaddasp vs4, vs34, vs14 - xvmaddasp vs5, vs34, vs15 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE2x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - -#endif - /*aggregate vectors*/ - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs26, vs1, alpha_r -#else - 
xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs26, vs1, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs26, 0(T1) - - addi CO,CO,16 - -.endm - - -/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ -.macro SWITCH_PERMUTE_INNER - xxpermdi permute_mask, permute_mask, permute_mask,2 -.endm - -.macro Zero2x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - SWITCH_PERMUTE_INNER -.endm - -.macro KERNEL2x2 - KERNEL2x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxperm vs9, vs36, permute_mask - lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs37, vs36 - xvmulsp vs1, vs37, vs9 - -.else - xvmaddasp vs0, vs37, vs36 - xvmaddasp vs1, vs37, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP2(\Index,8) - -.endm - - - - -.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - xxperm vs11, vs10, permute_mask - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs16, vs11 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - -.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP4(\Index,16) -.endif -.endm - - -.macro SAVE2x2 - -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) - -#endif - /*aggregate vectors*/ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - /* */ - /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ - xxperm vs1,vs1, permute_mask - - - xxmrghw vs2 ,vs1,vs0 - xxpermdi vs2,vs2,vs2,2 - xxmrghw vs3 ,vs0,vs1 -#if defined(TRMMKERNEL) - xvmulsp vs36, vs2, alpha_r - xvmulsp vs37, vs3, alpha_r -#else - xvmaddasp vs36, vs2, alpha_r - xvmaddasp vs37, vs3, alpha_r -#endif - /**** store last two words*/ - - - stxsd v4, 0(CO) - stxsd v5, 0(T1) - - addi CO,CO,8 - -.endm - -/*--------------------------- M=1 N=2 */ -.macro Zero2x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL2x1 - KERNEL2x1_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs2, vs37, vs35 - xvmulsp vs3, vs37, vs36 - -.else - xsmaddadp vs2, 
vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP2(\Index,8) -.endm - - -.macro SAVE2x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxssp v5 , 0(T1) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 2x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 0(T1) - - addi CO,CO,4 - -.endm - - - -/****************************N=1 section*****************/ - -.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x16 - KERNEL1x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, 
vs38, 2 - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs2, vs32, vs10 - xvmaddasp vs3, vs33, vs10 - - - xvmaddasp vs0, vs34, vs11 - xvmaddasp vs1, vs35, vs11 - xvmaddasp vs2, vs36, vs11 - xvmaddasp vs3, vs37, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE1x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv vs19, 48(CO) -#endif - - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - addi CO,CO,64 - -.endm - -/* M=8 N=1 */ - -.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x8 - KERNEL1x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, 
DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - - - xvmaddasp vs2, vs34, vs11 - xvmaddasp vs3, vs35, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE1x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - /* aggregate vs0 vs2 and vs1 vs3*/ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - addi CO,CO,32 - -.endm -/*M=4*/ - -.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x4 - KERNEL1x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - - xvmaddasp vs1, vs27, vs9 - - xvmaddasp vs2, vs30, vs10 - - - xvmaddasp vs3, vs31, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs9 - - -.if \IsLast==1 - addi \BREG, 
\BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE1x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - /* aggregate */ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 - xvaddsp vs0,vs1,vs0 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r -#endif - stxv vs16, 0(CO) - - addi CO,CO,16 - -.endm - -/* M=2 N=1*/ -.macro Zero1x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL1x2 - KERNEL1x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs2, vs37, vs35 - xvmuldp vs3, vs37, vs36 - -.else - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x2 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - lxssp v5 , 4(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 1x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 4(CO) - - addi CO,CO,8 - -.endm -/*///////////////// N=1 M=1 //////////////////*/ -.macro Zero1x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2,vs2 - xxlxor vs3,vs3,vs3 - xxlxor vs4,vs4,vs4 -.endm - -.macro KERNEL1x1 - KERNEL1x1_1 AO,BO, 1, 0,0,0 -.endm - -.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro 
KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone ( FIRST==1 to zero vs4) - */ -.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs4, vs37, vs35 - -.else - xsmaddadp vs4, vs37, vs35 - .endif - - addi \AREG, \AREG, DISP1(\Index,4) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - -.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) - lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) - lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) - lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) - lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - xvmaddasp vs2, vs10, vs17 - xvmaddasp vs3, vs11, vs18 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs8, vs26 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) - lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs36, vs37 - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors */ - xvaddsp vs0,vs0,vs1 - xvaddsp vs2,vs2,vs3 - xvaddsp vs0,vs0,vs2 - - xxpermdi vs7,vs0,vs0,2 - xvaddsp vs0,vs0,vs7 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs7,vs5,vs6 - xsadddp vs4,vs4,vs7 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs4, vs16 - -#else - xsmaddadp vs36,vs4, vs16 -#endif - - stxssp v4, 0(CO) - - addi CO,CO,4 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 3 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 2 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // 
ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + + + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 OffsetA,OffsetB + + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + 
xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0,0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, 
DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, 
vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 +#ifndef TRMMKERNEL + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + + + + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + stxv vs38, 32(T1) + stxv vs39, 48(T1) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + + stxv vs40, 0(T2) + stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T2) + stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T3) + stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ + #ifndef TRMMKERNEL + lxv vs32, 0(T4) + lxv vs33, 16(T4) +#endif + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 
+#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + + + stxv vs38, 32(T5) + stxv vs39, 48(T5) + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T6) + stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + + stxv vs44, 0(T7) + stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro 
KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + +.if \Complete==0 + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, 
vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + 
xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + 
lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + +.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + 
xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 
OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + 
xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp 
v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + 
xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi 
vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, 
vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 
+**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, 
vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, 
vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + 
+/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, 
\AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 
0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp 
vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, 
DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi 
permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + 
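+ /* 4 k-steps per call: vs8/vs10 hold the packed n=2 pairs of B; vs26 below holds the four matching A values, which xxmrglw/xxmrghw duplicate so each A element multiplies both B columns in the accumulating vector FMAs into vs0/vs1 */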
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 
64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 
0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r 
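+ /* GEMM path: C += alpha * accumulated A*B; the TRMM path above overwrites C with alpha * A*B instead */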
+#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + 
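+ /* one k-step: a single element of A and of B; \First==1 seeds accumulator vs4 with a plain multiply, otherwise a scalar FMA adds into vs4; both pointers then advance by one float (4 bytes) */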
+ +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 5dfb18f5b..f5c1ba729 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -1,470 +1,470 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_n.c" - -#else - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - } - -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = 
NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n 
& -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); -} - -#endif - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" + +#else + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float v_x4 = {x4,x4,x4,x4}; + __vector float v_x5 = {x5,x5,x5,x5}; + __vector float v_x6 = {x6,x6,x6,x6}; + __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i++) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i++ ) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + } + +} + + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = 
NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n 
& -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + +#endif + diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index 64696236a..0edb79129 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -1,514 +1,514 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. 
But it was not competitive enough to be added in production -It could be used and tested in future or could provide barebone for switching to inline assembly -*/ - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - register __vector float v_x0 = {x0,x0,x0,x0}; - register __vector float v_x1 = {x1,x1,x1,x1}; - register __vector float v_x2 = {x2,x2,x2,x2}; - register __vector float v_x3 = {x3,x3,x3,x3}; - register __vector float v_x4 = {x4,x4,x4,x4}; - register __vector float v_x5 = {x5,x5,x5,x5}; - register __vector float v_x6 = {x6,x6,x6,x6}; - register __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i+=2) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float vb0_1=vb0[i] ; - register __vector float vb0_2=vb0[i+1] ; - register __vector float vb1_1=vb1[i] ; - register __vector float vb1_2=vb1[i+1] ; - register __vector float vb2_1=vb2[i] ; - register __vector float vb2_2=vb2[i+1] ; - register __vector float vb3_1=vb3[i] ; - register __vector float vb3_2=vb3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i+=2 ) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; 
- register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } - -} - - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 7 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - - if ( m3 & 4 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - if ( lda == 4 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; - temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; - - temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; - temp1 += a_ptr[9] * 
x_ptr[2] + a_ptr[13] * x_ptr[3]; - temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; - temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; - - a_ptr += 16; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0] ; - a_ptr +=4; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - y_ptr += inc_y; - y_ptr[0] += alpha * temp3; - y_ptr += inc_y; - a += 4; - } - - - if ( m3 & 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - a += 2; - } - - if ( m3 & 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - - - } - - - return(0); -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could provide barebone for switching to inline assembly +*/ + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 
= {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } + +} + + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 7 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += 
lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + + if ( m3 & 4 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + if ( lda == 4 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; + temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; + + temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; + temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; + temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 62c517a9d..c3fc8e77a 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -1,484 +1,484 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_t.c" - -#else - -#include "common.h" - -#define NBMAX 2048 - -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - 
__vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - 
a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } - - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - 
for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); - -} - -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + 
temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * 
xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + +#endif diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index b90512162..1ee7c8aeb 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -1,508 +1,508 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could be used as base for switching to inline assembly -*/ - -#include "common.h" -#include -#define NBMAX 4096 - -#include - -static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i +=2) { - register __vector float vx1=v_x[i] ; - register __vector float vx2=v_x[i+1] ; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float va4_1=va4[i] ; - register __vector float va4_2=va4[i+1] ; - register __vector float va5_1=va5[i] ; - register __vector float va5_2=va5[i+1] ; - register __vector float va6_1=va6[i] ; - register __vector float va6_2=va6[i+1] ; - register __vector float va7_1=va7[i] ; - register __vector float va7_2=va7[i+1] ; - temp0 += vx1* va0_1 + vx2 * va0_2; - temp1 += vx1* va1_1 + vx2 * va1_2; - temp2 += vx1* va2_1 + vx2 * va2_2; - 
temp3 += vx1* va3_1 + vx2 * va3_2; - temp4 += vx1* va4_1 + vx2 * va4_2; - temp5 += vx1* va5_1 + vx2 * va5_2; - temp6 += vx1* va6_1 + vx2 * va6_2; - temp7 += vx1* va7_1 + vx2 * va7_2; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 7; - m1 = m - 
m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 & 4) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp3 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 4 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; - y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; - y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; - aj += 16; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - aj += 4; - } - - } else if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; - y_ptr += inc_y; - aj += lda; - } - - } - if 
(m3==4) return (0); - a_ptr += 4; - } - - if (m3 & 2 ) { - - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - if (m3==2) return (0); - a_ptr += 2; - } - if (m3 & 1) { - - FLOAT xtemp = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - - } - a_ptr += 1; - } - return (0); - -} - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could be used as base for switching to inline assembly +*/ + +#include "common.h" +#include +#define NBMAX 4096 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT 
*y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += 
ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + 
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index d1e60da6c..f9320d516 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -1,245 +1,245 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
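
Read scalar-wise, the 8x8 VSX micro-kernel above accumulates eight dot products at once, four float lanes per vector register, and only performs the horizontal lane reduction and the multiplication by alpha at the very end. A plain-C sketch of that accumulation follows; the name is illustrative and the two-vectors-per-iteration unrolling of the real kernel is omitted.

#include <stddef.h>

/* Scalar rendering of what sgemv_kernel_8x8 accumulates: eight adjacent
   stored columns, one dot product each against the same x block of nb rows. */
static void sgemv_kernel_8x8_ref(size_t nb, size_t lda, const float *ap,
                                 const float *x, float *y, float alpha)
{
    for (size_t k = 0; k < 8; k++) {
        float sum = 0.0f;                 /* plays the role of temp0..temp7 */
        for (size_t i = 0; i < nb; i++)
            sum += ap[k * lda + i] * x[i];
        y[k] += alpha * sum;              /* lane reduction and alpha, once */
    }
}
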
-*****************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - -#define LOAD ld - -#define STACKSIZE 512 - -#define FZERO 312+192(SP) - -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ - -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - - -#define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 - -#define VECSAVE r11 - -#define FRAMEPOINTER r12 - -#define T10 r14 - -#define L r15 -#define T8 r16 -#define T5 r17 -#define T2 r19 -#define TEMP_REG r20 -#define T6 r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T7 r27 -#define T3 r28 -#define T4 r29 - -#define PRE r30 -#define T1 r31 - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - mflr r0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - xxspltd alpha_r,vs1,0 /*copy from register f1 */ - xxspltd alpha_i,vs2,0 /*copy from register f2 */ - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - std r0, FLINK_SAVE(SP) - - -#if defined(linux) || defined(__FreeBSD__) - ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) -#endif - - -#ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) -#endif -#endif - - -#include "zgemm_macros_power9.S" - - - - slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li r0, 0 - - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegdp alpha_r,alpha_r - xvnegdp alpha_i,alpha_i -#endif - .align 4 - -#include "zgemm_logic_power9.S" - -L999: - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - 
EPILOGUE +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) 
+ + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE #endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index fe5d8ade2..850b41aff 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1,1891 +1,1891 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
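
Aside from the register spills, the prologue above does only a little scalar set-up: LDC arrives as an element count and is shifted into a byte stride, alpha_r/alpha_i are splatted across vector registers, and the builds selected by the #if (CC, CR, RC, RR) negate alpha up front, as the kernel's own comment notes ("we will use addition -1*(a+b)"). A rough C analogue is sketched here, under the assumption that ZBASE_SHIFT is 4 (one double-complex element being 16 bytes); the function and parameter names are illustrative only.

#include <stdint.h>

/* Sketch of the prologue's scalar set-up (assumption: ZBASE_SHIFT == 4). */
static void zgemm_prologue_sketch(int64_t ldc, double alpha_r, double alpha_i,
                                  int negate_alpha)
{
    int64_t ldc_bytes = ldc << 4;               /* slwi LDC, LDC, ZBASE_SHIFT */
    double alpha_r_v[2] = { alpha_r, alpha_r }; /* xxspltd alpha_r, vs1, 0    */
    double alpha_i_v[2] = { alpha_i, alpha_i }; /* xxspltd alpha_i, vs2, 0    */
    if (negate_alpha) {                         /* CC / CR / RC / RR builds   */
        alpha_r_v[0] = -alpha_r_v[0]; alpha_r_v[1] = -alpha_r_v[1];
        alpha_i_v[0] = -alpha_i_v[0]; alpha_i_v[1] = -alpha_i_v[1];
    }
    (void)ldc_bytes; (void)alpha_r_v; (void)alpha_i_v;
}
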
-*****************************************************************************/ -#define MY_ALIGN .align 3 -b ZGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -ZGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 -ZGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_L2 256,64,31,0 - KERNEL2x8_L2 256,64,32,0 - KERNEL2x8_L2 256,64,33,0 - KERNEL2x8_L2 256,64,34,0 - KERNEL2x8_L2 256,64,35,0 - KERNEL2x8_L2 256,64,36,0 - KERNEL2x8_L2 256,64,37,0 - KERNEL2x8_L2 256,64,38,0 - KERNEL2x8_L2 256,64,39,0 - KERNEL2x8_L2 256,64,40,0 - KERNEL2x8_L2 256,64,41,0 - KERNEL2x8_L2 256,64,42,0 - KERNEL2x8_L2 256,64,43,0 - KERNEL2x8_L2 256,64,44,0 - KERNEL2x8_L2 256,64,45,0 - KERNEL2x8_L2 256,64,46,0 - KERNEL2x8_L2 256,64,47,0 - KERNEL2x8_L2 256,64,48,0 - KERNEL2x8_L2 256,64,49,0 - KERNEL2x8_L2 256,64,50,0 - KERNEL2x8_L2 256,64,51,0 - KERNEL2x8_L2 256,64,52,0 - KERNEL2x8_L2 256,64,53,0 - KERNEL2x8_L2 256,64,54,0 - KERNEL2x8_L2 256,64,55,0 - KERNEL2x8_L2 256,64,56,0 - KERNEL2x8_L2 256,64,57,0 - KERNEL2x8_L2 256,64,58,0 - KERNEL2x8_L2 256,64,59,0 - KERNEL2x8_L2 256,64,60,0 - KERNEL2x8_L2 256,64,61,0 - KERNEL2x8_L2 256,64,62,0 - KERNEL2x8_L2 256,64,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -ZGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_E2 256,64,31,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt 
AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_E2 256,64,15,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_E2 256,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -ZGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,0,0 -ZGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_L2 128,64,7,0 - KERNEL2x4_L2 128,64,8,0 - KERNEL2x4_L2 128,64,9,0 - KERNEL2x4_L2 128,64,10,0 - KERNEL2x4_L2 128,64,11,0 - KERNEL2x4_L2 128,64,12,0 - KERNEL2x4_L2 128,64,13,0 - KERNEL2x4_L2 128,64,14,0 - KERNEL2x4_L2 128,64,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -ZGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_E2 128,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_E2 128,64,3,1 - blr - - -ZGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -ZGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,0,0 -ZGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_L2 64,64,7,0 - KERNEL2x2_L2 64,64,8,0 - KERNEL2x2_L2 64,64,9,0 - KERNEL2x2_L2 64,64,10,0 - KERNEL2x2_L2 64,64,11,0 - KERNEL2x2_L2 64,64,12,0 - KERNEL2x2_L2 64,64,13,0 - KERNEL2x2_L2 64,64,14,0 - KERNEL2x2_L2 64,64,15,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN - - -ZGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -ZGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_E2 64,64,7,1 - blr - MY_ALIGN -ZGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_E2 64,64,3,1 - blr - - -ZGEMM_2x1_LMAIN_SUB: 
-/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -ZGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,0,0 -ZGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_L2 32,64,7,0 - KERNEL2x1_L2 32,64,8,0 - KERNEL2x1_L2 32,64,9,0 - KERNEL2x1_L2 32,64,10,0 - KERNEL2x1_L2 32,64,11,0 - KERNEL2x1_L2 32,64,12,0 - KERNEL2x1_L2 32,64,13,0 - KERNEL2x1_L2 32,64,14,0 - KERNEL2x1_L2 32,64,15,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -ZGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_E2 32,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_E2 32,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -ZGEMM_L2: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 1 - ble ZGEMM_L2_END - - -ZGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - - -ZGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8O 128,32 - END2x8_WITHOUT_ADD - LOAD2x8_2O 256, 64 - mtctr T8 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-256 - LOAD2x8_2O 256,64 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - MY_ALIGN - - -ZGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_L2 256,64, 1,0 - KERNEL2x8_L2 256,64, 2,0 - KERNEL2x8_E2 256,64, 3,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_E2 256,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 256,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 - - -ZGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN - - -ZGEMM_L2x8_END: -/*----------------------------------------*/ - - -ZGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 - - -ZGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4O 64,32 - END2x4_WITHOUT_ADD - LOAD2x4_2O 128, 64 - mtctr T8 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD2x4_2O 128,64 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x4_SUB2_4 - bl ZGEMM_2x4_L8_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 128,64, 0,0 - KERNEL2x4_E2 128,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 128,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 - - -ZGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -ZGEMM_L2x4_END: -/*----------------------------------------*/ - - -ZGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble ZGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - bl ZGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 - - -ZGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2O 32,32 - END2x2_WITHOUT_ADD - LOAD2x2_2O 64, 64 - mtctr T8 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD2x2_2O 64,64 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x2_SUB2_8 - bl ZGEMM_2x2_L16_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x2_SUB2_4 - bl ZGEMM_2x2_L8_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 64,64, 0,0 - KERNEL2x2_E2 64,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 64,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 - - -ZGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -ZGEMM_L2x2_END: -/*----------------------------------------*/ - - -ZGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - bl ZGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 - - -ZGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1O 16,32 - END2x1_WITHOUT_ADD - LOAD2x1_2O 32, 64 - mtctr T8 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD2x1_2O 32,64 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x1_SUB2_8 - bl ZGEMM_2x1_L16_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L2x1_SUB2_4 - bl ZGEMM_2x1_L8_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 32,64, 0,0 - KERNEL2x1_E2 32,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 32,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 - - -ZGEMM_L2x1_SAVE: -/*----------------------------------------*/ - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -ZGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - bgt ZGEMM_L2_BEGIN - - -ZGEMM_L2_END: - -b ZGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -ZGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 -ZGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_L2 256,32,31,0 - KERNEL1x8_L2 256,32,32,0 - KERNEL1x8_L2 256,32,33,0 - KERNEL1x8_L2 256,32,34,0 - KERNEL1x8_L2 256,32,35,0 - KERNEL1x8_L2 256,32,36,0 - KERNEL1x8_L2 256,32,37,0 - KERNEL1x8_L2 256,32,38,0 - KERNEL1x8_L2 256,32,39,0 - KERNEL1x8_L2 256,32,40,0 - KERNEL1x8_L2 256,32,41,0 - KERNEL1x8_L2 256,32,42,0 - KERNEL1x8_L2 256,32,43,0 - KERNEL1x8_L2 256,32,44,0 - KERNEL1x8_L2 256,32,45,0 - KERNEL1x8_L2 256,32,46,0 - KERNEL1x8_L2 256,32,47,0 - KERNEL1x8_L2 256,32,48,0 - KERNEL1x8_L2 256,32,49,0 - KERNEL1x8_L2 256,32,50,0 - KERNEL1x8_L2 256,32,51,0 - KERNEL1x8_L2 256,32,52,0 - KERNEL1x8_L2 256,32,53,0 - KERNEL1x8_L2 256,32,54,0 - KERNEL1x8_L2 256,32,55,0 - KERNEL1x8_L2 256,32,56,0 - KERNEL1x8_L2 256,32,57,0 - KERNEL1x8_L2 256,32,58,0 - KERNEL1x8_L2 256,32,59,0 - KERNEL1x8_L2 256,32,60,0 - KERNEL1x8_L2 256,32,61,0 - KERNEL1x8_L2 256,32,62,0 - KERNEL1x8_L2 256,32,63,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -ZGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - 
KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_E2 256,32,31,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_E2 256,32,15,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_E2 256,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN - - -ZGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,0,0 - - -ZGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_L2 128,32,7,0 - KERNEL1x4_L2 128,32,8,0 - KERNEL1x4_L2 128,32,9,0 - KERNEL1x4_L2 128,32,10,0 - KERNEL1x4_L2 128,32,11,0 - KERNEL1x4_L2 128,32,12,0 - KERNEL1x4_L2 128,32,13,0 - KERNEL1x4_L2 128,32,14,0 - KERNEL1x4_L2 128,32,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN - - -ZGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -ZGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_E2 128,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_E2 128,32,3,1 - blr - - -ZGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN - - -ZGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,0,0 - - -ZGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_L2 64,32,7,0 - KERNEL1x2_L2 64,32,8,0 - KERNEL1x2_L2 
64,32,9,0 - KERNEL1x2_L2 64,32,10,0 - KERNEL1x2_L2 64,32,11,0 - KERNEL1x2_L2 64,32,12,0 - KERNEL1x2_L2 64,32,13,0 - KERNEL1x2_L2 64,32,14,0 - KERNEL1x2_L2 64,32,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN - - -ZGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN - - -ZGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_E2 64,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_E2 64,32,3,1 - blr - - -ZGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN - - -ZGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,0,0 - - -ZGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_L2 32,32,7,0 - KERNEL1x1_L2 32,32,8,0 - KERNEL1x1_L2 32,32,9,0 - KERNEL1x1_L2 32,32,10,0 - KERNEL1x1_L2 32,32,11,0 - KERNEL1x1_L2 32,32,12,0 - KERNEL1x1_L2 32,32,13,0 - KERNEL1x1_L2 32,32,14,0 - KERNEL1x1_L2 32,32,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN - - -ZGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - MY_ALIGN - - -ZGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_E2 32,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_E2 32,32,3,1 - blr - - -/*----------------------N1 BEGINS---------*/ -ZGEMM_L1: -/*----------------------------------------*/ - andi. T1, N, 1 - ble ZGEMM_L1_END - -ZGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - bl ZGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 - - -ZGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8O 128,16 - END1x8_WITHOUT_ADD - LOAD1x8_2O 256, 32 - mtctr T8 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-256 - LOAD1x8_2O 256,32 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - MY_ALIGN - - -ZGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L1x8_SUB2_32 - bl ZGEMM_1x8_L64_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L1x8_SUB2_16 - bl ZGEMM_1x8_L32_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x8_SUB2_8 - bl ZGEMM_1x8_L16_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_L2 256,32, 1,0 - KERNEL1x8_L2 256,32, 2,0 - KERNEL1x8_E2 256,32, 3,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_E2 256,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 256,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - - -ZGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt ZGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - b ZGEMM_L1x4_BEGIN - MY_ALIGN - - -ZGEMM_L1x8_END: -/*----------------------------------------*/ - - -ZGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - bl ZGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 - - -ZGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4O 64,16 - END1x4_WITHOUT_ADD - LOAD1x4_2O 128, 32 - mtctr T8 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD1x4_2O 128,32 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x4_SUB2_8 - bl ZGEMM_1x4_L16_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L1x4_SUB2_4 - bl ZGEMM_1x4_L8_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 128,32, 0,0 - KERNEL1x4_E2 128,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 128,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 - - -ZGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -ZGEMM_L1x4_END: -/*----------------------------------------*/ - - -ZGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - bl ZGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 - - -ZGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2O 32,16 - END1x2_WITHOUT_ADD - LOAD1x2_2O 64, 32 - mtctr T8 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD1x2_2O 64,32 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x2_SUB2_8 - bl ZGEMM_1x2_L16_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x2_SUB2_4 - bl ZGEMM_1x2_L8_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 64,32, 0,0 - KERNEL1x2_E2 64,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 64,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 - - -ZGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -ZGEMM_L1x2_END: -/*----------------------------------------*/ - - -ZGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - bl ZGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 - - -ZGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1O 16,16 - END1x1_WITHOUT_ADD - LOAD1x1_2O 32, 32 - mtctr T8 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD1x1_2O 32,32 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x1_SUB2_8 - bl ZGEMM_1x1_L16_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x1_SUB2_4 - bl ZGEMM_1x1_L8_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 32,32, 0,0 - KERNEL1x1_E2 32,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 32,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 - - -ZGEMM_L1x1_SAVE: -/*----------------------------------------*/ - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -ZGEMM_L1x1_END: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - - -ZGEMM_L1_END: -/*----------------------------------------*/ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +ZGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt 
AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +ZGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: 
+/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN + + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + 
KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 
64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 8670e9574..68024b826 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -1,1825 +1,1825 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define unit_size 16 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) -/* HELPERS FOR SAVE */ -/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ - - -.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET -#ifndef TRMMKERNEL - lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) - lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) - xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 - xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 -#endif -.endm -/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ - - -.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ -.endm -/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ - - -.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ -.endm -/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ - - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead instead to fix sign*/ - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm -/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ - - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 -#ifndef TRMMKERNEL - xvmsubadp \VSOUT1,\VSINII, alpha_i - xvmaddadp \VSOUT2,\VSINRR, alpha_i -#else - xvmuldp \VSOUT1,\VSINII, alpha_i - xvmuldp \VSOUT2,\VSINRR, alpha_i -#endif -.endm -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubadp \VSOUT1,\VSINRR, alpha_r - xvmaddadp \VSOUT2,\VSINII, alpha_r -.endm -/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ - - -.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrghd \VSOUT1,\VSIN2,\VSIN1 - xxmrgld \VSOUT2,\VSIN2,\VSIN1 -.endm - - -.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 - stxv \VSIN1, DISPX(\LOFFSET)(\REG) - stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) -.endm - - -.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - 
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 - MULT_APLHA_PART1 vs6,vs8,vs16,vs17 - MULT_APLHA_PART2 vs2,vs4,vs14,vs15 - AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - MULT_APLHA_PART1 vs10,vs12, vs24,vs25 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - MULT_APLHA_PART2 vs10,vs12,vs24,vs25 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 - MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 - UNPACK_FOR_STORE vs24,vs25,vs10,vs12 - UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 - STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 - STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 -.endm - - -.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART1 vs6,vs8, vs16,vs17 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 -.endm - - - -.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 -.endm - - - -.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 -#ifndef TRMMKERNEL - lxv vs18, (\LOFFSET)(\BASE_REG) - xxmrgld vs14,vs18,vs18 - xxmrghd vs15,vs18,vs18 -#endif - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - xxmrghd vs7,vs15,vs14 - stxv vs7, (\LOFFSET)(\BASE_REG) -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 
- xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x8_2 - /*for load2 offset will be 256 and 64*/ - KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 -.endm - - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs48, vs8, vs22 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs49, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs50, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs51, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs52, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs54, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs55, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs56, vs12, vs22 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs58, vs13, vs22 - xvmaddadp vs43, vs13, vs21 - xvmaddadp vs59, vs13, 
vs23 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs60, vs14, vs22 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs62, vs15, vs22 - xvmaddadp vs47, vs15, vs21 - xvmaddadp vs63, vs15, vs23 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 128,32 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,64,32 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 - -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_2 - /*for load2 offset will be 128 and 64*/ - KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs41, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs43, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs47, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 64,32 -.endm - - - -.macro SAVE2x4 - add T1, CO ,LDC - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero2x2 - xxlxor 
vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,32,32 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 - -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_2 - /*for load2 offset will be 64 and 64*/ - KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 -.endm - - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs37, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs39, vs9, vs23 -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x2 - LOAD2x2 - 
END2x2 AO, BO, 32,32 -.endm - - - -.macro SAVE2x2 - add T1, CO ,LDC - SAVE2 vs32,vs33,vs34,vs35,CO,0 - SAVE2 vs36,vs37,vs38,vs39,T1,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,16,32 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_2 - /*for load2 offset will be 32 and 64*/ - KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 -.endm - - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs35, vs8, vs23 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 16,32 -.endm - - - -.macro SAVE2x1 - add T1, CO ,LDC - SAVE1 vs32,vs33,CO,0 - SAVE1 vs34,vs35,T1,0 - addi CO, CO, 16 -.endm - -/********************************************************************************************** -* - -.macros for N=1 and M=8 
-**********************************************************************************************/ - - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,128,16 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 - -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x8_2 - /*for load2 offset will be 256 and 32*/ - KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 -.endm - - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, 
vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs43, vs13, vs21 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs47, vs15, vs21 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 128,16 -.endm - - -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,64,16 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x4_2 - /*for load2 offset will be 128 and 32*/ - KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 -.endm - - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 64,16 -.endm - - - -.macro SAVE1x4 - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - addi CO, 
CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,32,16 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x2_2 - /*for load2 offset will be 64 and 32*/ - KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 -.endm - - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 32,16 -.endm - - - -.macro SAVE1x2 - SAVE2 vs32,vs33,vs34,vs35,CO,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A - xxswapd vs17, vs16 - -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,16,16 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x1_2 - /*for load2 offset will be 32 and 32*/ - KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 -.endm - - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 16,16 -.endm - - - -.macro SAVE1x1 - SAVE1 vs32,vs33,CO,0 - addi CO, CO, 16 -.endm - -/****************************TRMM POINTER REFRESH - -.macroSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 8 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 4 - .endif -.endm -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ - - -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ - - -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
- /* temp = bk-off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
- #elif defined(LEFT)
- /* temp = off+INCR_A; // number of values in A */
- addi \TEMP_BK, \OFF_VAL, \INCR_A
- #else
- /* temp = off+INCR_B // number of values in B*/
- addi \TEMP_BK,\OFF_VAL, \INCR_B
- #endif
-.endm
-/*
-// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-// temp = bk - off;
-// #ifdef LEFT
-// temp -= 16; // number of values in A
-// #else
-// temp -= 2; // number of values in B
-// #endif
-// ptrba += temp*16;
-// ptrbb += temp*2;
-// #endif
-// #ifdef LEFT
-// off += 16; // number of values in A
-// #endif
-*/
-
-
-
-.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
- #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- /*temp = bk - off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
- #ifdef LEFT
- /*temp -= 8; // number of values in A*/
- addi \TEMP_BK,\TEMP_BK,-\C_A
- #else
- /*temp -= 4; // number of values in B*/
- addi \TEMP_BK,\TEMP_BK,-\C_B
- #endif
- /*ptrba += temp*C_A;
- ptrbb += temp*C_B;*/
- SHIFT_REG T4,\TEMP_BK,\C_A
- SHIFT_REG T2,\TEMP_BK,\C_B
- add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
- add \PTR_B, \PTR_B,T2
- #endif
- #ifdef LEFT
- /*off += 8; // number of values in A*/
- addi \OFF_VAL,\OFF_VAL,\C_A
- #endif
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+/* HELPERS FOR SAVE */
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
+/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha image instead instead to fix sign*/
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
+/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
+ MULT_APLHA_PART1 vs6,vs8,vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4,vs14,vs15
+ AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ MULT_APLHA_PART1 vs10,vs12, vs24,vs25
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ MULT_APLHA_PART2 vs10,vs12,vs24,vs25
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+ MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
+ UNPACK_FOR_STORE vs24,vs25,vs10,vs12
+ UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
+ STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
+ STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
+.endm
+
+
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART1 vs6,vs8, vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+.endm
+
+
+
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+.endm
+
+
+
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
+#ifndef TRMMKERNEL
+ lxv vs18, (\LOFFSET)(\BASE_REG)
+ xxmrgld vs14,vs18,vs18
+ xxmrghd vs15,vs18,vs18
+#endif
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ xxmrghd vs7,vs15,vs14
+ stxv vs7, (\LOFFSET)(\BASE_REG)
+.endm
+/**********************************************************************************************
+*
+
+.macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro Zero2x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,128,32 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, 
vs23 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 128,32 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,64,32 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 + +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 64,32 +.endm + + + +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero2x2 + xxlxor 
vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,32,32 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 + +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 +.endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x2 + LOAD2x2 + 
END2x2 AO, BO, 32,32 +.endm + + + +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,16,32 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 
+**********************************************************************************************/ + + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,128,16 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, 
vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 128,16 +.endm + + +.macro SAVE1x8 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,64,16 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 64,16 +.endm + + + +.macro SAVE1x4 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, 
CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,32,16 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 32,16 +.endm + + + +.macro SAVE1x2 + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,16,16 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
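(Editorial note, not part of the patch: the SHIFT_REG and REFRESH_POINTERS macros above convert the TRMM offset `off` from an element count into a byte displacement. Every packed value in these double-complex kernels is 16 bytes, which is why SHIFT_VAL values 16/8/4/2/1 map to left shifts of 8/7/6/5/4. A rough C equivalent, with illustrative names and a plain flag standing in for the LEFT/TRANSA preprocessor test:)

#include <stddef.h>

/* Editorial sketch only: models SHIFT_REG / REFRESH_POINTERS above for the
 * double-complex kernels, where one packed value occupies 16 bytes.        */
static size_t shift_bytes(size_t off, unsigned width)       /* SHIFT_REG    */
{
    return off * width * 16u;     /* == off << (log2(width) + 4) in the asm */
}

static void refresh_pointers(double **ptr_a, double **ptr_b, double *bb,
                             size_t off, unsigned c_a, unsigned c_b,
                             int keep_b_at_start)   /* LEFT/TRANSA #if test */
{
    if (keep_b_at_start) {
        *ptr_b = bb;                                 /* ptrbb = bb          */
    } else {                                         /* ptrba += off*C_A... */
        *ptr_a = (double *)((char *)*ptr_a + shift_bytes(off, c_a));
        *ptr_b = (double *)((char *)bb + shift_bytes(off, c_b));
    }
}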
+ /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index ef156fd27..76ea12fee 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,6806 +1,6806 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 4 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 384 -* A_PR1 512 -* B_PR1 512 -* -* -* 2014/07/28 Saar -* Performance at 9216x9216x9216: -* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) -* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) -* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) -* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) -* -*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define BO2 %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#if defined(OS_WINDOWS) -#define L_BUFFER_SIZE 8192 -#else -#define L_BUFFER_SIZE 12288 -#endif - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 - -#else - -#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 6 lines of N -*******************************************************************************************/ - -.macro KERNEL16x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vmovups -8 * SIZE(AO), %ymm1 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - prefetcht0 A_PR1(AO) - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - 
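(Editorial note, not part of the patch: this hunk spans the whole of sgemm_kernel_16x4_haswell.S with an unchanged line count, so the macros shown as removed here reappear later in the patch. KERNEL16x6_SUB, whose tail is just above, performs one rank-1 update of a 16x6 tile: two 8-float loads of A, six broadcast values of B, and twelve packed FMAs into ymm4..ymm15 before advancing AO and BO. A scalar C model of one k step, with illustrative names:)

/* Editorial sketch only: scalar model of one k iteration of KERNEL16x6_SUB.
 * acc[j][i] stands for the twelve accumulators ymm4..ymm15, with j indexing
 * the six broadcast B values and i the sixteen packed A values.            */
static void kernel16x6_step(const float *ao, const float *bo, float acc[6][16])
{
    for (int j = 0; j < 6; j++)            /* vbroadcastss of B[j]          */
        for (int i = 0; i < 16; i++)       /* two 8-wide vmovups from AO    */
            acc[j][i] += ao[i] * bo[j];    /* VFMADD231PS_                  */
}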
- addq $ 6*SIZE, BO - addq $ 16*SIZE, AO - decq %rax -.endm - -.macro SAVE16x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm13, %ymm13 - vmulps %ymm0 , %ymm14, %ymm14 - vmulps %ymm0 , %ymm15, %ymm15 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 - - vaddps (CO2), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2), %ymm11,%ymm11 - - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 - - vaddps (CO2, LDC,2), %ymm14,%ymm14 - vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) - - vmovups %ymm10, (CO2) - vmovups %ymm11, 8 * SIZE(CO2) - - vmovups %ymm12, (CO2, LDC) - vmovups %ymm13, 8 * SIZE(CO2, LDC) - - vmovups %ymm14, (CO2, LDC,2) - vmovups %ymm15, 8 * SIZE(CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE8x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm14, %ymm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps (CO2), %ymm10,%ymm10 - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps (CO2, LDC,2), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm10, (CO2) - vmovups %ymm12, (CO2, LDC) - vmovups %ymm14, (CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x6_SUB - vmovups -16 * SIZE(AO), %xmm0 - vbroadcastss -4 * SIZE(BO), %xmm2 - vbroadcastss -3 * SIZE(BO), %xmm3 - - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - - vbroadcastss -2 * SIZE(BO), %xmm2 - vbroadcastss -1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - - vbroadcastss 0 * SIZE(BO), %xmm2 - vbroadcastss 1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE4x6 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - vmulps %xmm0 , 
%xmm12, %xmm12 - vmulps %xmm0 , %xmm14, %xmm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO1, LDC,2), %xmm8,%xmm8 - vaddps (CO2), %xmm10,%xmm10 - vaddps (CO2, LDC), %xmm12,%xmm12 - vaddps (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO1, LDC,2) - vmovups %xmm10, (CO2) - vmovups %xmm12, (CO2, LDC) - vmovups %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -15 * SIZE(AO), %xmm1 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE2x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm13, %xmm13 - vmulss %xmm0 , %xmm14, %xmm14 - vmulss %xmm0 , %xmm15, %xmm15 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 - - vaddss (CO2), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2), %xmm11,%xmm11 - - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 - - vaddss (CO2, LDC,2), %xmm14,%xmm14 - vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) - - vmovss %xmm10, (CO2) - vmovss %xmm11, 1 * SIZE(CO2) - - vmovss %xmm12, (CO2, LDC) - vmovss %xmm13, 1 * SIZE(CO2, LDC) - - vmovss %xmm14, (CO2, LDC,2) - vmovss %xmm15, 1 * SIZE(CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 1*SIZE, AO - decq %rax -.endm - -.macro SAVE1x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm14, %xmm14 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 
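(Editorial note, not part of the patch: the SAVE*x6 macros in this region share one epilogue shape: scale every accumulator by ALPHA, add the existing C tile unless TRMMKERNEL is defined, then store back to the six columns at CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC and CO2+2*LDC. In C terms, roughly, with illustrative names:)

/* Editorial sketch only: the common shape of the SAVEmx6 epilogues (m is the
 * row count of the tile: 16, 8, 4, 2 or 1).                                 */
static void save_mx6(float *c, long ldc, float alpha,
                     const float acc[6][16], int m, int trmmkernel)
{
    for (int j = 0; j < 6; j++)               /* CO1, CO1+LDC, ..., CO2+2*LDC */
        for (int i = 0; i < m; i++) {
            float v = alpha * acc[j][i];      /* vmulps/vmulss by ALPHA       */
            if (!trmmkernel)
                v += c[j * ldc + i];          /* vaddps/vaddss from C         */
            c[j * ldc + i] = v;               /* vmovups/vmovss store         */
        }
}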
- vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss (CO2), %xmm10,%xmm10 - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm10, (CO2) - vmovss %xmm12, (CO2, LDC) - vmovss %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( 
%xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N 
-*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - 
addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - -/************************************************************************************* -* GEMM Kernel -*************************************************************************************/ - - - PROLOGUE - 
PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv6 // N / 12 - movq %rdx, Nmod6 // N % 12 - - movq Ndiv6, J - cmpq $0, J - je .L4_00 - ALIGN_4 - - -/*******************************************************************************************/ - -.L6_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq BO2, B // next offset of B - movq K, %rax - - ALIGN_4 - - -.L6_02c: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 4*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_02c - - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - - ALIGN_4 - -.L6_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x6_SUB - - jnz .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L6_60 // to next 6 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - 
KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - ALIGN_4 - -.L6_20_7: - - KERNEL8x6_SUB - - jnz .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x6_SUB - - jnz .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - - ALIGN_4 - -.L6_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x6_SUB - - jnz .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L6_60 // to next 4 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x6_SUB - - jnz .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L6_60: - - -/*******************************************************************************************/ - - -.L7_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq K, %rax - - ALIGN_4 
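(Editorial note, not part of the patch: the copy loop that follows, .L7_02c, is the second half of the B repack for the 12-column outer loop. B appears to arrive packed in K x 4 panels; .L6_02c earlier in this hunk takes all four values of the first panel plus the first two of the second, and .L7_02c takes the remaining two of the second panel plus all four of the third, so each pass leaves a K x 6 strip in BUFFER1 for the 6-column kernels. A C sketch of the combined repack, with illustrative names:)

/* Editorial sketch only: combined effect of the .L6_02c and .L7_02c copy
 * loops for one group of 12 B columns (three K x 4 source panels feeding
 * two K x 6 working buffers).                                              */
static void repack_b_12cols(const float *b, long k, float *buf6a, float *buf6b)
{
    const float *p0 = b, *p1 = b + 4 * k, *p2 = b + 8 * k;
    for (long i = 0; i < k; i++) {
        for (int j = 0; j < 4; j++) buf6a[6 * i + j]     = p0[4 * i + j];
        for (int j = 0; j < 2; j++) buf6a[6 * i + 4 + j] = p1[4 * i + j];
        for (int j = 0; j < 2; j++) buf6b[6 * i + j]     = p1[4 * i + 2 + j];
        for (int j = 0; j < 4; j++) buf6b[6 * i + 2 + j] = p2[4 * i + j];
    }
}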
- - -.L7_02c: - - vmovsd 2*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_02c - - movq BO2, B // next offset of B - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - - ALIGN_4 - -.L7_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_4 - -.L7_17: - - KERNEL16x6_SUB - - jnz .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 6 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x6_SUB - - jnz .L7_20_7 - ALIGN_4 - - -.L7_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x6_SUB - - jnz .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 
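(Editorial note, not part of the patch: every k loop in this file, including the .L7_12/.L7_16/.L7_17 sequence above, follows the same pattern: round the trip count down to a multiple of eight, run an unrolled body that re-tests the counter after each block of eight KERNEL*_SUB calls (each call ends with decq %rax), then handle the k % 8 remainder in a short second loop. Roughly, in C, with illustrative names:)

/* Editorial sketch only: control flow of the unrolled k loops in this file. */
typedef void (*kernel_sub_fn)(void *state);    /* stands for KERNELmxn_SUB   */

static void run_k_loop(long k, kernel_sub_fn sub, void *state)
{
    long rax = k & ~7L;                  /* andq $-8, %rax                    */
    while (rax > 0) {                    /* je .L*_16 after each block        */
        for (int u = 0; u < 8; u++) {
            sub(state);                  /* KERNELmxn_SUB ends with decq %rax */
            rax--;
        }
    }
    for (long r = k & 7L; r > 0; r--)    /* andq $7, %rax remainder           */
        sub(state);                      /* .L*_17                            */
}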
- - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x6_SUB - - jnz .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 4 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x6_SUB - - jnz .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L7_60: - - decq J // j -- - jg .L6_01 // next 12 lines of N - - - - -/*******************************************************************************************/ -.L4_00: - - movq Nmod6, J - sarq $2, J // j = j / 4 - cmpq $ 0, J - je .L2_00 - ALIGN_4 - - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = 
BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_00: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - 
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - 
negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // 
Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - 
KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if 
!defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#else - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq 
%rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of 
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq 
(BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 
4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - 
addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax 
// number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, 
%rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB 
- - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#endif - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + 
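For orientation only: the VFMADD231PS_ / VFMADD231SS_ wrappers above select the four-operand FMA4 encoding (vfmaddps / vfmaddss) when BULLDOZER is defined and the FMA3 encoding (vfmadd231ps / vfmadd231ss) otherwise, so the kernel macros that follow are written once for both instruction sets. The C model below is an illustrative sketch, not OpenBLAS code (the name micro_kernel_16x6_ref and the array shapes are assumptions); it shows the arithmetic that one expansion of the KERNEL16x6_SUB macro defined next performs: a rank-1 update of a 16x6 block of C accumulators from 16 packed A values and 6 packed B values.

/*
 * Illustrative C model (hypothetical helper, not part of this patch) of one
 * KERNEL16x6_SUB step.  In the assembly, acc[j][0..7] and acc[j][8..15]
 * live in two ymm registers each, and every b[j] is a vbroadcastss feeding
 * a packed FMA via VFMADD231PS_.
 */
static void micro_kernel_16x6_ref(const float a[16], const float b[6],
                                  float acc[6][16])
{
    for (int j = 0; j < 6; j++)          /* 6 columns of B per step        */
        for (int i = 0; i < 16; i++)     /* 16 rows of A per step          */
            acc[j][i] += a[i] * b[j];    /* acc += a * broadcast(b[j])     */
}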
+/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm13, %ymm13 + vmulps %ymm0 , %ymm14, %ymm14 + vmulps %ymm0 , %ymm15, %ymm15 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 + + vaddps (CO2), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2), %ymm11,%ymm11 + + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 + + vaddps (CO2, LDC,2), %ymm14,%ymm14 + vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) + + vmovups %ymm10, (CO2) + vmovups %ymm11, 8 * SIZE(CO2) + + vmovups %ymm12, (CO2, LDC) + vmovups %ymm13, 8 * SIZE(CO2, LDC) + + vmovups %ymm14, (CO2, LDC,2) + vmovups %ymm15, 8 * SIZE(CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, 
LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss 
%xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + 
VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + 
VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( 
%xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * 
SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + 
+ andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + 
prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + 
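The .L6_* and .L7_* column blocks above all share the same K-loop shape: "andq $-8, %rax" drives a main loop that expands the KERNEL*x6_SUB macro in groups of eight (the macro itself decrements %rax, and the interleaved "je" checks leave the loop on zero), while "andq $7, %rax" drives a one-step remainder loop (.L6_17, .L7_17, and so on) for the last K % 8 iterations. A minimal C sketch of that split, assuming a generic kernel_step callback (hypothetical, for illustration only):

/* Hypothetical sketch of the K-loop split used by the .L6_* / .L7_* blocks:
 * the unrolled body runs K & ~7 kernel steps, the tail runs K & 7 more.    */
static void k_loop_ref(long K, void (*kernel_step)(void))
{
    for (long k = 0; k < (K & ~7L); k += 8)
        for (int u = 0; u < 8; u++)      /* eight KERNEL*_SUB expansions    */
            kernel_step();
    for (long k = 0; k < (K & 7L); k++)  /* remainder loop, one step each   */
        kernel_step();
}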
+/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: 
+ + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + 
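(For orientation while reading the TRMM-conditional blocks above and below: each M-tile repeats the same offset bookkeeping, skipping the first KK packed entries of A and B, limiting the K loop through KKK, and advancing KK after the tile. The C sketch below is illustrative only; the names kk, k_iter, m_tile and n_tile are stand-ins for the KK/KKK registers and the 16/8/4/2/1-row tile sizes used in the assembly, not symbols from this patch.)

    /* Illustrative sketch, not part of the patch: the K-range selection
       performed by the TRMMKERNEL preprocessor blocks for one
       m_tile x n_tile block.  left/transa mirror the LEFT/TRANSA defines. */
    static long trmm_k_iterations(long k, long kk, long m_tile, long n_tile,
                                  int left, int transa)
    {
        if ((left && !transa) || (!left && transa))
            return k - kk;                    /* movq K,%rax ; subq KK,%rax */
        return kk + (left ? m_tile : n_tile); /* movq KK,%rax ; addq $tile,%rax */
    }

(When LEFT is defined, KK is advanced by the M-tile size after each SAVE; otherwise it is advanced by the column count, 4 or 2, at the .L4_60/.L2_60 labels at the end of each N iteration.)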
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 
2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + 
KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * 
SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + 
KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + 
movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, 
%rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + 
andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of 
values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + 
+.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + 
movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + 
movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h index 36b7aa1a3..970d63578 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -1,226 +1,226 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ -/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ -/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ - -#define init_m8n4(c1,c2,c3,c4)\ - "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ - "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" -#define INIT_m8n4 init_m8n4(4,5,6,7) -#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) -#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) - -#define init_m4n4(c1,c2,c3,c4)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ - "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" -#define INIT_m4n4 init_m4n4(4,5,6,7) -#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) -#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) - -#define init_m2n4(c1,c2)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" -#define INIT_m2n4 init_m2n4(4,5) -#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) -#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) - -#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" -#define INIT_m1n4 init_m1n4(4) -#define INIT_m1n8 INIT_m1n4 init_m1n4(5) -#define INIT_m1n12 INIT_m1n8 init_m1n4(6) - -#define GEMM_KERNEL_k1m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ - "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ - "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" -#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" -#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" - -#define GEMM_KERNEL_k1m4n4 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ - "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ - "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ - "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" -#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ - 
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" - -#define GEMM_KERNEL_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ - "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" -#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ - "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ - "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" - -#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" -#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" -#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" - -#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" - -#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ - "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" - -#define GEMM_SUM_REORDER_2x4(c1,c2)\ - "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ - "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ - "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ - -#define GEMM_SUM_REORDER_1x4(c1)\ - "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" - -#define SOLVE_le_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovsldup %%ymm"#c1",%%ymm1;" - -#define SOLVE_le_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" - -#define SOLVE_leri_m4n2(b_off,c1,...) 
SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_ri_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovshdup %%ymm"#c1",%%ymm1;" - -#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" - -#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ - "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ - "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ - "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m4n2(c1,a_off)\ - "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ - "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m1n4(c1,a_off)\ - "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; 
vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; 
vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) 
SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S index 94e2f6117..6c8b4c872 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S +++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S @@ -1,1404 +1,1404 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, 
SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq 
$16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K 
= K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - 
-#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd 
(CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq 
BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp 
.L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I 
%xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), 
%xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define 
KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + 
+.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + 
vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + 
vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S index 848b6f237..bffe5439d 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -1,1429 +1,1429 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/30 Saar -* -* Parameter: -* UNROLL_M 2 -* UNROLL_N 2 -* ZGEMM_P 384 -* ZGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) -* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) -* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) -* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) -* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) -* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) -* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) -* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) -* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - 
-#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I 
%xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI 
- movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - 
KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq 
(,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - 
-.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - 
vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, 
%xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 
-5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), 
%xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + 
vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax 
+#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff 
--git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f91bfa89b..29729b101 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,3881 +1,3881 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/******************************************************************************** -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* ZGEMM_DEFAULT_UNROLL_N 2 -* ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 256 -* ZGEMM_DEFAULT_Q 128 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/28 Saar -* Performance at 4608x4608x4608: -* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) -* -********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#endif - -#endif - -#define A_PR1 512 -#define B_PR1 512 - - - -/***************************************************************************************************/ - -.macro KERNEL4x3_SUB - vmovups (AO), %ymm0 - vmovups 4 * SIZE(AO), %ymm1 - prefetcht0 A_PR1(AO) - - vbroadcastsd (BO), %ymm2 - vbroadcastsd 1 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) - - vbroadcastsd 2 * SIZE(BO), %ymm2 - vbroadcastsd 3 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) - - vbroadcastsd 4 * SIZE(BO), %ymm2 - vbroadcastsd 5 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 - vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - vaddsubpd %ymm4 , %ymm5 ,%ymm5 - vaddsubpd %ymm6 , %ymm7 ,%ymm7 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - vmovapd %ymm5 , %ymm4 - vmovapd %ymm7 , %ymm6 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - vmulpd %ymm4 , %ymm0, %ymm4 - vmulpd %ymm6 , %ymm0, %ymm6 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - vmulpd %ymm5 , %ymm1, %ymm5 - vmulpd %ymm7 , %ymm1, %ymm7 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - - vaddpd (CO1, LDC,2), %ymm4 , %ymm4 - vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - 
- vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - vmovups %ymm4 , (CO1, LDC, 2) - vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - - - -/***************************************************************************************************/ - -.macro KERNEL2x3_SUB - vmovups (AO), %xmm0 - vmovups 2 * SIZE(AO), %xmm1 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE2x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - vaddsubpd %xmm4, %xmm5 ,%xmm5 - vaddsubpd %xmm6, %xmm7 ,%xmm7 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - vmovapd %xmm5, %xmm4 - vmovapd %xmm7, %xmm6 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - vmulpd %xmm4 , %xmm0, %xmm4 - vmulpd %xmm6 , %xmm0, %xmm6 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - vmulpd %xmm5 , %xmm1, %xmm5 - vmulpd %xmm7 , %xmm1, %xmm7 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - - vaddpd (CO1, LDC,2), %xmm4 , %xmm4 - vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * 
SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - - vmovups %xmm4 , (CO1, LDC,2) - vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) - -.endm - - -/************************************************************************************************/ - - -.macro KERNEL1x3_SUB - vmovups (AO), %xmm0 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE1x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - vaddsubpd %xmm4, %xmm5, %xmm5 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm5, %xmm4 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm4 , %xmm0, %xmm4 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm5 , %xmm1, %xmm5 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - -#ifndef TRMMKERNEL - - vaddpd (CO1) , %xmm8 , %xmm8 - vaddpd (CO1, LDC) , %xmm10, %xmm10 - vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm4 , (CO1, LDC,2) - -.endm - - - - -/***************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - - vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) - - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, 
%xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.macro KERNEL1x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 - vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13,%ymm12 , %ymm12 - - vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - -#else - vaddsubpd %ymm8, %ymm9 , %ymm9 - vaddsubpd %ymm12,%ymm13, %ymm13 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm13, %ymm12 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - 
vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm13, %ymm1, %ymm13 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13, %ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 ,4 * SIZE(CO1) - -.endm - - - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - - -/************************************************************************************************/ - - - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 
128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -/************************************************************************************************/ -.L6_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L2_00_0 - ALIGN_4 - - - -.L6_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq BO2, B // next offset of B - movq K, %rax - ALIGN_4 - -.L6_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups (BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_00_02b - -.L6_00_02c: - - - -.L6_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L6_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L6_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_4_16 - ALIGN_4 - -.L6_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - jmp .L6_4_12 - ALIGN_4 - -.L6_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_4_19 - ALIGN_4 - -.L6_4_17: - - KERNEL4x3_SUB - - jnz .L6_4_17 - ALIGN_4 - - -.L6_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L6_2_10: - testq $ 2, M - jz .L6_2_40 // to next 2 lines of N - -.L6_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_16 - ALIGN_4 - -.L6_2_12: - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - jmp .L6_2_12 - 
ALIGN_4 - -.L6_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_19 - ALIGN_4 - -.L6_2_17: - - KERNEL2x3_SUB - - jnz .L6_2_17 - ALIGN_4 - - -.L6_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_2_40: - testq $ 1, M - jz .L6_2_60 // to next 2 lines of N - - ALIGN_4 - -.L6_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_46 - - ALIGN_4 - -.L6_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - jmp .L6_2_42 - ALIGN_4 - -.L6_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_49 - - ALIGN_4 - -.L6_2_47: - - KERNEL1x3_SUB - - jnz .L6_2_47 - ALIGN_4 - - -.L6_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L6_2_41 - ALIGN_4 - - - - -.L6_2_60: - - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.L7_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq K, %rax - ALIGN_4 - -.L7_00_02b: - - vmovups 2 * SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_00_02b - -.L7_00_02c: - - movq BO2, B // next offset of B - - -.L7_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L7_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L7_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_4_16 - ALIGN_4 - -.L7_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - jmp .L7_4_12 - ALIGN_4 - -.L7_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_4_19 - - ALIGN_4 - -.L7_4_17: - - KERNEL4x3_SUB - - jnz .L7_4_17 - ALIGN_4 - - -.L7_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L7_2_10: - testq $ 2, M - jz .L7_2_40 // to next 2 lines of N - -.L7_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_16 - ALIGN_4 - -.L7_2_12: - - 
KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - jmp .L7_2_12 - ALIGN_4 - -.L7_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_19 - - ALIGN_4 - -.L7_2_17: - - KERNEL2x3_SUB - - jnz .L7_2_17 - ALIGN_4 - - -.L7_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_2_40: - testq $ 1, M - jz .L7_2_60 // to next 2 lines of N - - ALIGN_4 - -.L7_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_46 - - ALIGN_4 - -.L7_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - jmp .L7_2_42 - ALIGN_4 - -.L7_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_49 - ALIGN_4 - -.L7_2_47: - - KERNEL1x3_SUB - - jnz .L7_2_47 - ALIGN_4 - - -.L7_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L7_2_41 - ALIGN_4 - - - - -.L7_2_60: - - decq J // j -- - jg .L6_00_01 // next 6 lines of N - -/************************************************************************************************/ - - - -/************************************************************************************************/ -.L2_00_0: - - movq Nmod6, J - sarq $1, J // j = j / 2 - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = 
K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - 
- andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - 
KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, 
KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - 
movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************************ - TRMM Kernel -************************************************************************************************/ - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - 
addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO 
// first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax 
- ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL 
- movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - -#endif - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/******************************************************************************** +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/28 Saar +* Performance at 4608x4608x4608: +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* +********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#endif + +#endif + +#define A_PR1 512 +#define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + 
+ vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * 
SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm + + +/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) + + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, 
%xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + 
vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 
128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + 
ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + 
KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = 
K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + 
+ andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + 
KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, 
KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + 
movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, 
BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + 
addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL 
+ movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#endif + + diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 78fb1431f..b92089418 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,86 +1,86 @@ -include_directories(${PROJECT_SOURCE_DIR}) -include_directories(${PROJECT_BINARY_DIR}) -include_directories(${PROJECT_SOURCE_DIR}/relapack) - -set(RELAFILES -clauum.c -ctrsyl_rec2.c -dsytrf.c -spbtrf.c -strsyl_rec2.c -zhetrf_rook_rec2.c -ztrsyl.c -cgbtrf.c -cpbtrf.c -ctrtri.c -dsytrf_rec2.c -spotrf.c -strtri.c -zlauum.c -ztrsyl_rec2.c -cgemmt.c -cpotrf.c -dgbtrf.c -dsytrf_rook.c -lapack_wrappers.c -ssygst.c -zgbtrf.c -zpbtrf.c -ztrtri.c -cgetrf.c -csytrf.c -dgemmt.c -dsytrf_rook_rec2.c -ssytrf.c -zgemmt.c -zpotrf.c -chegst.c -csytrf_rec2.c -dgetrf.c -dtgsyl.c -ssytrf_rec2.c -zgetrf.c -zsytrf.c -chetrf.c -csytrf_rook.c -dlauum.c -dtrsyl.c -sgbtrf.c -ssytrf_rook.c -zhegst.c -zsytrf_rec2.c -chetrf_rec2.c -csytrf_rook_rec2.c -dpbtrf.c -dtrsyl_rec2.c -sgemmt.c -ssytrf_rook_rec2.c -zhetrf.c -zsytrf_rook.c -chetrf_rook.c -ctgsyl.c -dpotrf.c -dtrtri.c -sgetrf.c -stgsyl.c -zhetrf_rec2.c -zsytrf_rook_rec2.c -chetrf_rook_rec2.c -ctrsyl.c -dsygst.c -f2c.c -slauum.c -strsyl.c -zhetrf_rook.c -ztgsyl.c -) - - - -# add relapack folder to the sources -set(RELA_SOURCES "") -foreach (RELA_FILE ${RELAFILES}) - list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") -endforeach () -add_library(relapack_src OBJECT ${RELA_SOURCES}) -set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) + +set(RELAFILES +clauum.c +ctrsyl_rec2.c +dsytrf.c +spbtrf.c +strsyl_rec2.c +zhetrf_rook_rec2.c +ztrsyl.c +cgbtrf.c +cpbtrf.c +ctrtri.c +dsytrf_rec2.c +spotrf.c +strtri.c +zlauum.c +ztrsyl_rec2.c +cgemmt.c +cpotrf.c +dgbtrf.c +dsytrf_rook.c +lapack_wrappers.c +ssygst.c +zgbtrf.c +zpbtrf.c +ztrtri.c +cgetrf.c +csytrf.c +dgemmt.c +dsytrf_rook_rec2.c +ssytrf.c +zgemmt.c 
+zpotrf.c +chegst.c +csytrf_rec2.c +dgetrf.c +dtgsyl.c +ssytrf_rec2.c +zgetrf.c +zsytrf.c +chetrf.c +csytrf_rook.c +dlauum.c +dtrsyl.c +sgbtrf.c +ssytrf_rook.c +zhegst.c +zsytrf_rec2.c +chetrf_rec2.c +csytrf_rook_rec2.c +dpbtrf.c +dtrsyl_rec2.c +sgemmt.c +ssytrf_rook_rec2.c +zhetrf.c +zsytrf_rook.c +chetrf_rook.c +ctgsyl.c +dpotrf.c +dtrtri.c +sgetrf.c +stgsyl.c +zhetrf_rec2.c +zsytrf_rook_rec2.c +chetrf_rook_rec2.c +ctrsyl.c +dsygst.c +f2c.c +slauum.c +strsyl.c +zhetrf_rook.c +ztgsyl.c +) + + + +# add relapack folder to the sources +set(RELA_SOURCES "") +foreach (RELA_FILE ${RELAFILES}) + list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") +endforeach () +add_library(relapack_src OBJECT ${RELA_SOURCES}) +set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") From f73cfb7e2cf5a2a225bda6642c13c83b7bf9df29 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 09:39:56 +0100 Subject: [PATCH 101/154] change line endings from CRLF to LF --- kernel/x86_64/cgemm_kernel_4x2_bulldozer.S | 3794 +++--- kernel/x86_64/cgemm_kernel_4x2_piledriver.S | 3842 +++--- kernel/x86_64/cgemm_kernel_8x2_sandy.S | 4706 ++++---- kernel/x86_64/dgemm_kernel_16x2_haswell.S | 10430 ++++++++-------- kernel/x86_64/dgemm_kernel_4x4_haswell.S | 6988 +++++------ kernel/x86_64/dgemm_kernel_4x8_haswell.S | 10306 ++++++++-------- kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 1340 +-- kernel/x86_64/dgemm_kernel_8x2_bulldozer.S | 8826 +++++++------- kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 9046 +++++++------- kernel/x86_64/sgemm_kernel_16x2_bulldozer.S | 10462 ++++++++-------- kernel/x86_64/sgemm_kernel_16x2_piledriver.S | 10516 ++++++++--------- kernel/x86_64/sgemm_kernel_16x4_sandy.S | 6334 +++++----- kernel/x86_64/strsm_kernel_8x4_haswell_RN.c | 558 +- kernel/x86_64/strsm_kernel_8x4_haswell_RT.c | 562 +- 14 files changed, 43855 insertions(+), 43855 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S index 97958a88f..2675f71fb 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S +++ b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S @@ -1,1897 +1,1897 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I 
%xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, 
BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - 
-.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, 
%xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, 
SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - 
movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq 
K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq $1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq 
$1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I 
vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ 
+ +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + 
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 
8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + 
vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO 
+ negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK 
+#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next 
offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax 
+ movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S index 72deee12f..bf7f91ee9 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S +++ b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S @@ -1,1921 +1,1921 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 4 -* UNROLL_N 2 -* CGEMM_P 768 -* CGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) -* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) -* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) -* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) -* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) -* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) -* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) -* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) -* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - 
vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, 
%xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - 
KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - 
KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - 
-/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq 
$1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 
64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 4 +* UNROLL_N 2 +* CGEMM_P 768 +* CGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) +* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) +* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) +* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) +* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) +* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) +* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) +* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) +* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + 
vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, 
%xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + 
KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + 
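+// Sketch of the complex update that the SAVE code ending at .L1_19 above
+// appears to perform for the non-conjugated (NN/NT/TN/TT) case; the scalar
+// names below (acc8, acc9, sum_r, sum_i, c_r, c_i) are illustrative only and
+// are not defined anywhere in this file.  The $0xb1 vshufps swaps the real
+// and imaginary 32-bit halves of each 64-bit complex element, and vaddsubps
+// subtracts in the even lanes while adding in the odd lanes, so per element:
+//
+//   sum_r = acc8_r - acc9_i                 // first vaddsubps after the swap
+//   sum_i = acc8_i + acc9_r
+//   c_r  += alpha_r*sum_r - alpha_i*sum_i   // vmulps pair + second vaddsubps,
+//   c_i  += alpha_r*sum_i + alpha_i*sum_r   // then vaddps with C (non-TRMM)
+//
+// acc8/acc9 stand for the xmm8/xmm9 (and xmm12/xmm13) accumulator pairs fed
+// by the KERNEL*x1 macros; the conjugated variants differ only in the signs
+// selected through VFMADD_R / VFMADD_I.
+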
+/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq 
$1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 
64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_8x2_sandy.S b/kernel/x86_64/cgemm_kernel_8x2_sandy.S index c85646d43..988913591 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_sandy.S +++ b/kernel/x86_64/cgemm_kernel_8x2_sandy.S @@ -1,2353 +1,2353 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/29 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* CGEMM_DEFAULT_UNROLL_N 2 -* CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 768 -* CGEMM_DEFAULT_Q 512 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/29 Saar -* Performance at 6192x6192x6192: -* 1 thread: 49 GFLOPS (MKL: 52) -* 2 threads: 99 GFLOPS (MKL: 102) -* 3 threads: 148 GFLOPS (MKL: 150) -* 4 threads: 195 GFLOPS (MKL: 194) -* 8 threads: 354 GFLOPS (MKL: 317) -* -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps %ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#else - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps 
%ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/***************************************************************************************************************************/ - -.macro KERNEL8x2_1 - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - prefetcht0 A_PR1(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+64(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+128(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+192(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - addq $ 16, BI - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 64, %rax -.endm - - -.macro KERNEL8x2_SUB - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 
64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm10, %ymm11,%ymm11 - vaddsubps %ymm12, %ymm13,%ymm13 - vaddsubps %ymm14, %ymm15,%ymm15 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm11, %ymm10 - vmovaps %ymm13, %ymm12 - vmovaps %ymm15, %ymm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm10, %ymm0, %ymm10 - vmulps %ymm12, %ymm0, %ymm12 - vmulps %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm11, %ymm1, %ymm11 - vmulps %ymm13, %ymm1, %ymm13 - vmulps %ymm15, %ymm1, %ymm15 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - - vaddps (CO1, LDC), %ymm10, %ymm10 - vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 8 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, 
%xmm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps 
%xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - vmovsd %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm12, %ymm13,%ymm13 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm13, %ymm12 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm13, %ymm1, %ymm13 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and 
low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq 
OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L2_4_10 - - ALIGN_4 -/**********************************************************************************************************/ - -.L2_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - jmp .L2_8_12 - ALIGN_4 - -.L2_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_17: - - KERNEL8x2_SUB - - jl .L2_8_17 - ALIGN_4 - - -.L2_8_19: - - SAVE8x2 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq 
KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_8_11 - ALIGN_4 - - -/**********************************************************************************************************/ - - - - -.L2_4_10: - testq $ 7, M - jz .L2_4_60 // to next 2 lines of N - - testq $ 4, M - jz .L2_4_20 - ALIGN_4 - - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_4_20: - - testq $ 2, M - jz .L2_4_40 - ALIGN_4 - -.L2_4_21: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - jmp .L2_4_22 - ALIGN_4 - -.L2_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_27: - - KERNEL2x2_SUB - - jl .L2_4_27 - ALIGN_4 - - -.L2_4_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - 
addq $ 4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_4_21 - ALIGN_4 - - - -/**************************************************************************/ -.L2_4_40: - testq $ 1, M - jz .L2_4_60 // to next 2 lines of N - - ALIGN_4 - -.L2_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - jmp .L2_4_42 - ALIGN_4 - -.L2_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_47: - - KERNEL1x2_SUB - - jl .L2_4_47 - ALIGN_4 - - -.L2_4_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_4_41 - ALIGN_4 - - - - -.L2_4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) 
&& defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L1_4_10 - - ALIGN_4 - -/**************************************************************************************************/ - -.L1_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - jmp .L1_8_12 - ALIGN_4 - -.L1_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_17: - - KERNEL8x1_SUB - - jl .L1_8_17 - ALIGN_4 - - -.L1_8_19: - - SAVE8x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_8_11 - ALIGN_4 - - - -/**************************************************************************************************/ -.L1_4_10: - - testq $ 7, M - jz .L999 - - testq $ 4, M - jz .L1_4_20 - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_4_20: - - testq $ 2, M - jz .L1_4_40 - ALIGN_4 - -.L1_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - jmp .L1_4_22 - ALIGN_4 - -.L1_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_27: - - KERNEL2x1_SUB - - jl .L1_4_27 - ALIGN_4 - - -.L1_4_29: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_4_40: - testq $ 1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 
B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - jmp .L1_4_42 - ALIGN_4 - -.L1_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_47: - - KERNEL1x1_SUB - - jl .L1_4_47 - ALIGN_4 - - -.L1_4_49: - - SAVE1x1 - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/
+
+/*********************************************************************
+* 2014/07/29 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+* 2013/10/28 Saar
+* Parameter:
+* CGEMM_DEFAULT_UNROLL_N 2
+* CGEMM_DEFAULT_UNROLL_M 8
+* CGEMM_DEFAULT_P 768
+* CGEMM_DEFAULT_Q 512
+* A_PR1 512
+* B_PR1 512
+*
+* 2014/07/29 Saar
+* Performance at 6192x6192x6192:
+* 1 thread: 49 GFLOPS (MKL: 52)
+* 2 threads: 99 GFLOPS (MKL: 102)
+* 3 threads: 148 GFLOPS (MKL: 150)
+* 4 threads: 195 GFLOPS (MKL: 194)
+* 8 threads: 354 GFLOPS (MKL: 317)
+*
+*
+*********************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define M %r13
+#define J %r14
+#define OLD_K %rdx
+
+#define A %rcx
+#define B %r8
+#define C %r9
+#define LDC %r10
+
+#define I %r11
+#define AO %rdi
+#define BO %rsi
+#define CO1 %r15
+#define K %r12
+#define BI %rbp
+#define SP %rbx
+
+#define BO1 %rdi
+#define BO2 %r15
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+
+#else
+
+#define STACKSIZE 320
+
+#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
+#define OLD_A 48 + STACKSIZE(%rsp)
+#define OLD_B 56 + STACKSIZE(%rsp)
+#define OLD_C 64 + STACKSIZE(%rsp)
+#define OLD_LDC 72 + STACKSIZE(%rsp)
+#define OLD_OFFSET 80 + STACKSIZE(%rsp)
+
+#endif
+
+#define L_BUFFER_SIZE 8192
+
+#define Ndiv6 24(%rsp)
+#define Nmod6 32(%rsp)
+#define N 40(%rsp)
+#define ALPHA_R 48(%rsp)
+#define ALPHA_I 56(%rsp)
+#define OFFSET 64(%rsp)
+#define KK 72(%rsp)
+#define KKK 80(%rsp)
+#define BUFFER1 128(%rsp)
+
+#if defined(OS_WINDOWS)
+#if L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 4(%rsp);\
+        movl $ 0, 4096 * 3(%rsp);\
+        movl $ 0, 4096 * 2(%rsp);\
+        movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 3(%rsp);\
+        movl $ 0, 4096 * 2(%rsp);\
+        movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 2(%rsp);\
+        movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vaddps y0,%ymm2,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vaddps y0,%ymm3,y0
+
+#define VFMADDPS_R( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm2;\
+        vaddps y0,%xmm2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm3;\
+        vaddps y0,%xmm3,y0
+
+
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vsubps %ymm2,y0,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vaddps y0,%ymm3,y0
+
+#define VFMADDPS_R( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm2;\
+        vsubps %xmm2,y0,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm3;\
+        vaddps y0,%xmm3,y0
+
+
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vaddps y0,%ymm2,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vsubps %ymm3,y0,y0
+
+#define VFMADDPS_R( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm2;\
+        vaddps y0,%xmm2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm3;\
+        vsubps %xmm3,y0,y0
+
+
+#else
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vsubps %ymm2,y0,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vsubps
%ymm3,y0,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vsubps %xmm2,y0,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vsubps %xmm3,y0,y0 + + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_1 + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + prefetcht0 A_PR1(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+64(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+128(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+192(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + addq $ 16, BI + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 64, %rax +.endm + + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 
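The KERNEL8x2_1 macro above interleaves prefetcht0 requests A_PR1 (512 bytes) ahead of the A loads it is currently consuming, so the next panel lines are already in cache when the multiplies reach them. A minimal C sketch of the same streaming-prefetch idea, assuming GCC/Clang's __builtin_prefetch and an invented look-ahead of 128 floats (512 bytes); it illustrates the idiom only, not the kernel itself:

    #include <stdio.h>

    /* Stream through a buffer while prefetching ~512 bytes ahead, the way
     * KERNEL8x2_1 issues prefetcht0 A_PR1(AO,%rax,SIZE) alongside its loads. */
    static float sum_with_prefetch(const float *a, long n)
    {
        float s = 0.0f;
        for (long i = 0; i < n; i++) {
            if (i + 128 < n)
                __builtin_prefetch(a + i + 128, 0, 3);  /* read access, keep in cache */
            s += a[i];
        }
        return s;
    }

    int main(void)
    {
        float v[1024];
        for (int i = 0; i < 1024; i++) v[i] = 1.0f;
        printf("%g\n", sum_with_prefetch(v, 1024));     /* prints 1024 */
        return 0;
    }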
64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, 
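The VFMADDPS_* variants defined earlier differ only in whether the real-part and imaginary-part partial products are added or subtracted, which is how the four conjugation families (neither, A, B, or both operands conjugated) are selected, and the SAVE* macros then recombine the two accumulator registers and apply alpha. Despite the recurring "swap high and low 64 bytes" comments, vshufps with immediate 0xb1 swaps the 4-byte real and imaginary halves of each complex element. A scalar C model of the non-conjugated path for one output element, with invented names, not code from the kernel:

    #include <stdio.h>

    /* acc_r holds sum(a * Re(b)) as an interleaved (re, im) pair like %ymm8,
     * acc_i holds sum(a * Im(b)) like %ymm9.  The SAVE step's vshufps $0xb1 /
     * vaddsubps sequence is the recombination written out below, followed by
     * the multiply with ALPHA_R/ALPHA_I done the same way. */
    static void cgemm_lane_model(const float *a, const float *b, int k,
                                 float alpha_r, float alpha_i, float *c)
    {
        float accr_re = 0.0f, accr_im = 0.0f;   /* like %ymm8 */
        float acci_re = 0.0f, acci_im = 0.0f;   /* like %ymm9 */
        for (int p = 0; p < k; p++) {
            float ar = a[2 * p], ai = a[2 * p + 1];
            float br = b[2 * p], bi = b[2 * p + 1];
            accr_re += ar * br;  accr_im += ai * br;   /* VFMADDPS_YR */
            acci_re += ar * bi;  acci_im += ai * bi;   /* VFMADDPS_YI */
        }
        /* shuffle swaps (re, im) of acc_i; vaddsubps subtracts in the even
         * (real) lane and adds in the odd (imaginary) lane: */
        float t_re = accr_re - acci_im;
        float t_im = accr_im + acci_re;
        /* second shuffle + vmulps by ALPHA_R/ALPHA_I + vaddsubps = t * alpha */
        c[0] += t_re * alpha_r - t_im * alpha_i;
        c[1] += t_im * alpha_r + t_re * alpha_i;
    }

    int main(void)
    {
        float a[4] = {1, 2, 3, 4};          /* (1+2i), (3+4i) */
        float b[4] = {5, 6, 7, 8};          /* (5+6i), (7+8i) */
        float c[2] = {0, 0};
        cgemm_lane_model(a, b, 2, 1.0f, 0.0f, c);
        printf("%g %g\n", c[0], c[1]);      /* -18 68 */
        return 0;
    }

The "c[...] +=" at the end corresponds to the #ifndef TRMMKERNEL branch of each SAVE macro; the TRMM build stores the scaled product without reading C first.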
%xmm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps 
%xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and 
low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq 
OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
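One idiom worth calling out in the K loops above and below: AO and BO are first advanced past the packed blocks, then %rax and BI are negated so the KERNEL macros can count upward toward zero with plain addq and let the sign/zero flags drive the je/jl loop branches. A minimal C analogue of that negative-index counting, with invented names:

    #include <stdio.h>

    /* Sum n floats by moving the base pointer to the end of the block (like
     * AO/BO) and letting the index climb from -n toward zero. */
    static float sum_negative_index(const float *data, long n)
    {
        const float *end = data + n;
        float s = 0.0f;
        for (long i = -n; i != 0; i++)
            s += end[i];
        return s;
    }

    int main(void)
    {
        float v[5] = {1, 2, 3, 4, 5};
        printf("%g\n", sum_negative_index(v, 5));   /* prints 15 */
        return 0;
    }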
KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $ 7, M + jz .L2_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $ 2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + 
addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $ 1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) 
&& defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $ 7, M + jz .L999 + + testq $ 4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq $ 2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $ 1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index 98b582c0d..899c5f241 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -1,5215 +1,5215 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -/********************************************************************* -* 2013/10/20 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/20 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 2 -* DGEMM_DEFAULT_UNROLL_M 16 -* DGEMM_DEFAULT_P 192 -* DGEMM_DEFAULT_Q 128 -* A_PR1 512 -* -* -* Performance without prefetch of B: -* 1 thread: 45.8 GFLOPS (MKL: 45) -* 2 threads: 80.0 GFLOPS (MKL: 91) -* 4 threads: 135.0 GFLOPS (MKL: 135) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 512*8*4 -#define LB2_OFFSET 512*8*2 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -.macro VFMADD231PD_ y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmaddsd \x0,\x1,\x2,\x0 -.endm - -#else - -.macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y2,\y1,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x2,\x1,\x0 -.endm - -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -.macro KERNEL16x3_SUBN - prefetcht0 A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovaps -8 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovaps -4 * SIZE(AO), %ymm0 - VFMADD231PD_ 
%ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 16*SIZE, AO -.endm - - -.macro KERNEL8x3_SUBN - //prefetcht0 A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - //prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - prefetcht0 B_PR1(BO) - addq $ 3*SIZE , BO - addq $ 8*SIZE, AO -.endm - -.macro KERNEL4x3_SUBN - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 4*SIZE, AO -.endm - -.macro KERNEL2x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -15 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 2*SIZE, AO -.endm - -.macro KERNEL1x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 1*SIZE, AO -.endm - - - - - - -/******************************************************************************************/ - -.macro KERNEL16x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - - - - -.macro KERNEL16x3_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - prefetcht0 A_PR1+64(AO,%rax,SIZE) - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ 
%ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - addq $12, BI - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $64, %rax - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $3 , BI - addq $16, %rax -.endm - -.macro SAVE16x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm15, %ymm15 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 - vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, 
LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) - vmovups %ymm15,12 * SIZE(CO1, LDC, 2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $12, BI - addq $32, %rax -.endm - -.macro KERNEL8x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $3 , BI - addq $8 , %rax -.endm - -.macro SAVE8x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - -.endm - - - 
-/*******************************************************************************************/ - -.macro KERNEL4x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_2 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_3 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_4 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $12, BI - addq $16, %rax -.endm - -.macro KERNEL4x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $3 , BI - addq $4 , %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - 
VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $12, BI - addq $8, %rax -.endm - -.macro KERNEL2x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $3 , BI - addq $2 , %rax -.endm - -.macro SAVE2x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm12, %xmm12 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) - -.endm - -/*******************************************************************************************/ - -.macro KERNEL1x3_1 - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $12, BI - addq $4, %rax -.endm - -.macro KERNEL1x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $3 , BI - addq $1 , %rax -.endm - -.macro SAVE1x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - -#endif - - 
vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $8, BI - addq $64, %rax -.endm - -.macro KERNEL16x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $2, BI 
- addq $16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $8, BI - addq $32, %rax -.endm - -.macro KERNEL8x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $2, BI - addq $8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - 
-.macro KERNEL4x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_2 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_3 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_4 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $8, BI - addq $16, %rax -.endm - -.macro KERNEL4x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $2, BI - addq $4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $8, BI - addq $8, %rax -.endm - -.macro KERNEL2x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $2, BI - addq $2, %rax -.endm - -.macro SAVE2x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , 
%xmm10, %xmm10 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_1 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $8, BI - addq $4, %rax -.endm - -.macro KERNEL1x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $2, BI - addq $1, %rax -.endm - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 
- VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $4, BI - addq $64, %rax -.endm - -.macro KERNEL16x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $1, BI - addq $16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $4, BI - addq $32, %rax -.endm - -.macro KERNEL8x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $1, BI - addq $8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $4, BI - addq $16, %rax -.endm - -.macro KERNEL4x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $1, BI - addq $4 
, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $4, BI - addq $8, %rax -.endm - -.macro KERNEL2x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $1, BI - addq $2 , %rax -.endm - -.macro SAVE2x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro KERNEL1x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 1, BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq 
OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovups 0 * SIZE(BO1), %xmm0 - vmovsd 0 * SIZE(BO2), %xmm2 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm2, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - 
vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups 0*SIZE(BO2), %xmm1 - vmovsd %xmm0, 0*SIZE(BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $1, %rax // K / 8 - je .L6_16 - - ALIGN_5 - -.L6_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN -/* - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN -*/ - dec %rax - jne .L6_12 - -.L6_16: - movq K, %rax - - andq $1, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - dec %rax - jne .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - dec %rax - jne .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_36 - ALIGN_4 - -.L6_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - dec %rax - jne .L6_32 - ALIGN_4 - 
-.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3,%rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L7_16 - ALIGN_5 - -.L7_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - dec %rax - jne .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_5 - -.L7_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L7_17 - - -.L7_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_7 - ALIGN_4 - -.L7_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - dec %rax - jne .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L7_27 - ALIGN_4 - 
- -.L7_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - dec %rax - jne .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // 
BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax 
*2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number 
of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, 
BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - 
prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - 
- testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - 
addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // 
aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO 
- - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK 
-#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax 
# if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovaps -8 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovaps -4 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + //prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + 
VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + //prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + + + + +.macro KERNEL16x3_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + prefetcht0 A_PR1+64(AO,%rax,SIZE) + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ 
%ymm6,%ymm3,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + addq $12, BI + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $64, %rax + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm15, %ymm15 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 + vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) + vmovups %ymm15,12 * SIZE(CO1, LDC, 2) + +.endm + + + 
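For orientation: the KERNEL16x3_* and SAVE16x3 macros above form the 16x3 register-blocked micro-kernel. Per k step, three values of B are broadcast and multiplied into four 4-wide vectors of A with FMA, accumulating into ymm4..ymm15; on save, the twelve accumulators are scaled by ALPHA and added into three columns of C. The scalar C sketch below models the same update; it is illustrative only (the function name, packing layout and the tiny driver in main are assumptions of this sketch, not code from the patch).

    /* Rough scalar model of the 16x3 micro-kernel above (illustrative only,
     * not part of the patch).  A is packed 16 rows per k step, B is packed
     * 3 columns per k step, C is column-major with leading dimension ldc.
     * acc[][] plays the role of the ymm4..ymm15 accumulators. */
    #include <stdio.h>

    static void dgemm_tile_16x3(long K, double alpha, const double *A,
                                const double *B, double *C, long ldc)
    {
        double acc[16][3] = {{0.0}};
        for (long k = 0; k < K; k++)               /* one KERNEL16x3_SUBN per k    */
            for (int j = 0; j < 3; j++) {          /* vbroadcastsd of one B value  */
                double b = B[3 * k + j];
                for (int i = 0; i < 16; i++)       /* four vector loads of A + FMA */
                    acc[i][j] += A[16 * k + i] * b;
            }
        for (int j = 0; j < 3; j++)                /* SAVE16x3: C += alpha * acc   */
            for (int i = 0; i < 16; i++)
                C[i + j * ldc] += alpha * acc[i][j];
    }

    int main(void)
    {
        double A[16 * 2], B[3 * 2], C[16 * 3] = {0.0};
        for (int i = 0; i < 32; i++) A[i] = 1.0;
        for (int i = 0; i < 6;  i++) B[i] = 1.0;
        dgemm_tile_16x3(2, 1.0, A, B, C, 16);
        printf("%f\n", C[0]);                      /* prints 2.000000 */
        return 0;
    }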
+/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * 
SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + 
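The 8x3/4x3/2x3/1x3 macro groups above cover the tail rows when M is not a multiple of 16; the driver further down peels full 16-row tiles first and then tests the low bits of M (8, 4, 2, 1) to pick a narrower tile for the remainder. The short C model below sketches that dispatch order only; the function names are placeholders and the code is not part of the patch.

    /* Illustrative model of how the driver walks M for one 3-column panel:
     * full 16-row tiles first, then the 8/4/2/1-row tails, mirroring the
     * "testq $8/$4/$2/$1, M" cascade below.  Placeholder names only. */
    #include <stdio.h>

    static void tile(int rows) { printf("tile %dx3\n", rows); }

    static void walk_m(long M)
    {
        for (long i = M >> 4; i > 0; i--) tile(16);  /* sarq $4, I loop */
        if (M & 8) tile(8);                          /* testq $8, M     */
        if (M & 4) tile(4);                          /* testq $4, M     */
        if (M & 2) tile(2);                          /* testq $2, M     */
        if (M & 1) tile(1);                          /* testq $1, M     */
    }

    int main(void) { walk_m(29); return 0; }         /* 16 + 8 + 4 + 1  */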
+/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , 
%ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + 
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 
+ +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + 
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups 
%ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 1, BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + 
andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + 
+.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + 
+ addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to 
BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 
+ +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + 
KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + 
+ SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
+.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups 
%xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + 
+/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + 
je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + 
+ ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq 
(BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz 
.L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO 
+ leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S index 0a2ca7ae3..29501df8e 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -1,3494 +1,3494 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - - -/********************************************************************* -* 2013/10/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/27 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 4 -* DGEMM_DEFAULT_UNROLL_M 4 -* DGEMM_DEFAULT_P 512 -* DGEMM_DEFAULT_Q 256 -* A_PR1 512 -* B_PR1 512 -* -* -* Performance at 9216x9216x9216: -* 1 thread: 53.3 GFLOPS (MKL: 54) -* 2 threads: 100.0 GFLOPS (MKL: 97) -* 3 threads: 147.0 GFLOPS (MKL: 133) -* 4 threads: 184.0 GFLOPS (MKL: 170) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 - -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * 
SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd 
$ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * 
SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - 
- vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd 
%ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - 
-/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) 
,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv12 // N / 12 - movq %rdx, Nmod12 // N % 12 - - - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $2,%rax // K * 4 ; read 2 values - 
movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $1 , %rax // K / 2 - jz .L12_01a_2 - ALIGN_4 - -.L12_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetcht0 512(BO3) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm5 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 4 * SIZE(BO2), %ymm6 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups 4 * SIZE(BO3), %ymm7 - - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - - vmovups %ymm5, 12 * SIZE(BO) - vmovups %ymm6, 16 * SIZE(BO) - vmovups %ymm7, 20 * SIZE(BO) - - addq $ 8 * SIZE ,BO1 - addq $ 8 * SIZE ,BO2 - addq $ 8 * SIZE ,BO3 - addq $ 24 *SIZE ,BO - - decq %rax - jnz .L12_01a_1 - - - -.L12_01a_2: - - movq K, %rax - andq $1, %rax // K % 2 - jz .L12_03c - ALIGN_4 - - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 4*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - movq BO3, B // next offset of B - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO 
- - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - decq J // j -- - jg .L12_01 - - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $2, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 
* SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - 
addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv12 // N / 4 - movq %rdx, Nmod12 // N % 4 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - - - movq Ndiv12, J - cmpq $ 0, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, 
%rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq 
$12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg 
.L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 
2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values 
in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - - -.L1_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK // number of values in B -#endif - - - -.L999: - - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +/********************************************************************* +* 2013/10/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/27 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 4 +* DGEMM_DEFAULT_P 512 +* DGEMM_DEFAULT_Q 256 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 9216x9216x9216: +* 1 thread: 53.3 GFLOPS (MKL: 54) +* 2 threads: 100.0 GFLOPS (MKL: 97) +* 3 threads: 147.0 GFLOPS (MKL: 133) +* 4 threads: 184.0 GFLOPS (MKL: 170) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + 
vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + 
vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $ 0xb1 , %ymm13, %ymm13 + vpermpd $ 0xb1 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , 
%xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + 
vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 
+ vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + 
vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , 
%ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + 
STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv12 // N / 12 + movq %rdx, Nmod12 // N % 12 + + + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $2,%rax // K * 4 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $1 , %rax // K / 2 + jz .L12_01a_2 + ALIGN_4 + +.L12_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetcht0 512(BO3) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm5 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 4 * SIZE(BO2), %ymm6 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups 4 * SIZE(BO3), %ymm7 + + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + + vmovups %ymm5, 12 * SIZE(BO) + vmovups %ymm6, 16 * SIZE(BO) + vmovups %ymm7, 20 * SIZE(BO) + + addq $ 8 * SIZE ,BO1 + addq $ 8 * SIZE ,BO2 + addq $ 8 * SIZE ,BO3 + addq $ 24 *SIZE ,BO + + decq %rax + jnz .L12_01a_1 + + + +.L12_01a_2: + + movq K, %rax + andq $1, %rax // K % 2 + jz .L12_03c + ALIGN_4 + + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 4*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + movq BO3, B // next offset of B + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + 
KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + decq J // j -- + jg .L12_01 + + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $2, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + 
ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + 
+ movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv12 // N / 4 + movq %rdx, Nmod12 // N % 4 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + + + movq Ndiv12, J + cmpq $ 0, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + 
subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + 
ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + 
KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 19e32ef2c..adaa28bbc 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1,5153 +1,5153 @@ -/********************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 160 -#define BROADCASTKERNEL - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 
- -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# 
endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - prefetcht0 BUFFER1 - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 64 + BUFFER1 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 > 32 - prefetcht0 128 + BUFFER1 -#endif - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 > 96 - prefetcht0 192 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 -#endif - -#if B_PR1 > 160 - prefetcht0 256 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#endif - -#if B_PR1 > 224 - prefetcht0 320 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#endif - -#if B_PR1 > 288 - prefetcht0 384 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 
, %ymm7 -#endif - -#if B_PR1 > 352 - prefetcht0 448 + BUFFER1 -#endif - leaq (CO1, LDC, 2), %rax - -#if B_PR1 > 416 - prefetcht0 512 + BUFFER1 -#endif - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht1 56(CO1) - prefetcht1 56(CO1,LDC) - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9, %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 - vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 - vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 - vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 - vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm13, %ymm13 - vpermilpd $ 0x05 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, 
%xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - 
vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ - - -.macro INIT4x8 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - -.endm - -.macro KERNEL4x8_I - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vmulpd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, 
%ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -4 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 0 * SIZE(BO), %ymm2 - addq $ 16*SIZE, BO -.endm - - -.macro KERNEL4x8_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - addq $ 8*SIZE, BO -.endm - -.macro KERNEL4x8_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - -.endm - - -.macro SAVE4x8 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - 
vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9 , %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL2x8_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 2*SIZE, AO - -.endm - -.macro SAVE2x8 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if 
!defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL1x8_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 1*SIZE, AO - -.endm - -.macro SAVE1x8 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - - -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , 
%ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 
0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , 
%xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - 
-.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -.macro PREFETCHT0_C - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) -.endm -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $24, %rdi - divq %rdi // N / 24 - movq %rax, Ndiv12 // N / 24 - movq %rdx, Nmod12 // N % 24 - - - movq Ndiv12, J - cmpq $ 0, J - je .L8_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values from BO1 - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - movq BO2 , B - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm2 - vmovups 0 * SIZE(BO2), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - 
addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - prefetcht2 96(B) - prefetcht2 96(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - - /* recover the original value of pointer B after prefetch */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je 
.L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - - -/**************************************************************************************************/ - -.L13_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values - movq B, BO2 - leaq (B,%rax, SIZE), BO3 // next offset to BO2 - leaq (BO3,%rax, SIZE), B // next offset to B - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - - -.L13_02b: - - vmovups 4 * SIZE(BO2), %ymm1 - vmovups 0 * SIZE(BO3), %ymm2 - vmovups 4 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 8*SIZE,BO2 - addq $ 8*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L13_02b - - - -.L13_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L13_20 - - ALIGN_4 - -.L13_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L13_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L13_12a - - ALIGN_5 -.L13_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L13_12 - -.L13_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - -.L13_13: - - test $1, %rax - jz .L13_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - - -.L13_14: - - INIT4x12 - - -.L13_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_19 - - ALIGN_4 - -.L13_17: - - KERNEL4x12_SUB - - dec %rax - jne .L13_17 - ALIGN_4 - - -.L13_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - prefetcht2 64(B) - prefetcht2 64(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L13_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - /* recover the original value of pointer B */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L13_20: - // Test rest of M - - testq $3, M - jz .L13_100 // to next 16 lines of N - - -.L13_30: - testq $2, M - jz .L13_40 - - ALIGN_4 - -.L13_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L13_36 - ALIGN_4 - -.L13_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - 
jne .L13_32 - ALIGN_4 - -.L13_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_39 - - ALIGN_4 - -.L13_37: - - KERNEL2x12_SUB - - dec %rax - jne .L13_37 - ALIGN_4 - - -.L13_39: - - SAVE2x12 - - ALIGN_4 - -.L13_40: - testq $1, M - jz .L13_100 // to next 3 lines of N - - ALIGN_4 - -.L13_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L13_46 - - ALIGN_4 - -.L13_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L13_42 - ALIGN_4 - -.L13_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_49 - - ALIGN_4 - -.L13_47: - - KERNEL1x12_SUB - - dec %rax - jne .L13_47 - ALIGN_4 - - -.L13_49: - - SAVE1x12 - - ALIGN_4 - -.L13_100: - - decq J // j -- - jg .L12_01 - - - - -/**************************************************************************************************/ - -.L8_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $3, J // j = j / 8 - je .L4_0 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x8 - - movq K, %rax - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x8 - - movq K, %rax - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - 
KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - - ALIGN_4 - -.L8_100: - - movq K, %rax - salq $3, %rax // * 8 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L8_10 - - - -/**************************************************************************************************/ - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - testq $4, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - 
KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - 
KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $8, %rdi - divq %rdi // N / 8 - movq %rax, Ndiv12 // N / 8 - movq %rdx, Nmod12 // N % 8 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -/*************************************************************************************************/ -.L8_0: - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 8 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - 
- sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x8 - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq 
(BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x8 - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L8_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $8, KK // number of values in B -#endif - - - decq J // j -- - jg .L8_10 - - - - - -/*************************************************************************************************/ -.L4_0: - movq Nmod12, J - testq $4, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, 
%rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB 
- KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number 
of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - 
- dec %rax
- jne .L1_42
-
-.L1_46:
- movq KKK, %rax
-
- andq $7, %rax # if (k & 1)
- je .L1_49
-
- ALIGN_4
-
-.L1_47:
-
- KERNEL1x1_SUB
-
- dec %rax
- jne .L1_47
-
-
-.L1_49:
-
- SAVE1x1
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
- (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
- movq K, %rax
- subq KKK, %rax
- salq $3, %rax // rax * SIZE
- leaq (BO, %rax, 1), BO // number of values in B
- leaq (AO, %rax, 1), AO // number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
- addq $1, KK // number of values in A
-#endif
-
-
-
-.L1_100:
-
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
- addq $1, KK // number of values in B
-#endif
-
-
-
-.L999:
-
- vzeroupper
-
- movq SP, %rsp
- movq (%rsp), %rbx
- movq 8(%rsp), %rbp
- movq 16(%rsp), %r12
- movq 24(%rsp), %r13
- movq 32(%rsp), %r14
- movq 40(%rsp), %r15
-
-#ifdef WINDOWS_ABI
- movq 48(%rsp), %rdi
- movq 56(%rsp), %rsi
- vmovups 64(%rsp), %xmm6
- vmovups 80(%rsp), %xmm7
- vmovups 96(%rsp), %xmm8
- vmovups 112(%rsp), %xmm9
- vmovups 128(%rsp), %xmm10
- vmovups 144(%rsp), %xmm11
- vmovups 160(%rsp), %xmm12
- vmovups 176(%rsp), %xmm13
- vmovups 192(%rsp), %xmm14
- vmovups 208(%rsp), %xmm15
-#endif
-
- addq $STACKSIZE, %rsp
- ret
-
- EPILOGUE
-
-
-
-
-
-#endif
+/*********************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define M %r13
+#define J %r14
+#define OLD_K %rdx
+
+#define A %rcx
+#define B %r8
+#define C %r9
+#define LDC %r10
+
+#define I %r11
+#define AO %rdi
+#define BO %rsi
+#define CO1 %r15
+#define K %r12
+#define SP %rbx
+
+#define BO1 %rdi
+#define BO2 %r15
+#define BO3 %rbp
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+#define L_BUFFER_SIZE 256*8*12+4096
+
+#else
+
+#define STACKSIZE 256
+#define L_BUFFER_SIZE 128*8*12+512
+
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_B 48 + STACKSIZE(%rsp)
+#define OLD_C 56 + STACKSIZE(%rsp)
+#define OLD_LDC 64 + STACKSIZE(%rsp)
+#define OLD_OFFSET 72 + STACKSIZE(%rsp)
+
+#endif
+
+
+#define Ndiv12 24(%rsp)
+#define Nmod12 32(%rsp)
+#define N 40(%rsp)
+#define ALPHA 48(%rsp)
+#define OFFSET 56(%rsp)
+#define KK 64(%rsp)
+#define KKK 72(%rsp)
+#define BUFFER1 128(%rsp)
+
+#if defined(OS_WINDOWS)
+#if L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 4(%rsp);\
+ movl $ 0, 4096 * 3(%rsp);\
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 3(%rsp);\
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+#define A_PR1 512
+#define B_PR1 160
+#define BROADCASTKERNEL
+
+/*******************************************************************************************
+* Macro definitions
+*******************************************************************************************/
+
+.macro INIT4x12
+
+ vxorpd %ymm4 , %ymm4 , %ymm4
+ vxorpd %ymm5 , %ymm5 , %ymm5
+ vxorpd %ymm6 , %ymm6 , %ymm6
+ vxorpd %ymm7 , %ymm7 , %ymm7
+ vxorpd %ymm8 , %ymm8 , %ymm8
+ vxorpd %ymm9 , %ymm9 , %ymm9
+ vxorpd %ymm10, %ymm10, %ymm10
+ vxorpd %ymm11, %ymm11, %ymm11
+ vxorpd %ymm12, %ymm12, %ymm12
+ vxorpd %ymm13, %ymm13, %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+
+.endm
+
+.macro KERNEL4x12_I
+ prefetcht0 A_PR1(AO)
+ vmovups -12 * SIZE(BO), %ymm1
+ prefetcht0 B_PR1(BO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
+ vmovups -16 * SIZE(AO), %ymm0
+# endif
+ prefetcht0 B_PR1+64(BO)
+ vmovups -8 * SIZE(BO), %ymm2
+ prefetcht0 B_PR1+128(BO)
+ vmovups -4 * SIZE(BO), %ymm3
+ vmulpd %ymm0 ,%ymm1 , %ymm4
+ prefetcht0 B_PR1+192(BO)
+ vmulpd %ymm0 ,%ymm2 , %ymm8
+ vmulpd %ymm0 ,%ymm3 , %ymm12
+ prefetcht0 B_PR1+256(BO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
+ vmulpd %ymm0 ,%ymm1 , %ymm5
+ vmulpd %ymm0 ,%ymm2 , %ymm9
+ vmulpd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
+ vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
+ vmulpd %ymm0 ,%ymm1 , %ymm6
+ vmulpd %ymm0 ,%ymm2 , %ymm10
+
+ addq $ 12*SIZE, BO
+ vmulpd %ymm0 ,%ymm3 , %ymm14
+# if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
+ vmulpd %ymm0 ,%ymm1 , %ymm7
+ vmovups -12 * SIZE(BO), %ymm1
+ vmulpd %ymm0 ,%ymm2 , %ymm11
+ vmovups -8 * SIZE(BO), %ymm2
+ vmulpd %ymm0 ,%ymm3 , %ymm15
+ vmovups -4 * SIZE(BO), %ymm3
+
+.endm
+
+.macro KERNEL4x12_M1
+ prefetcht0 A_PR1(AO)
+# if
defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + 
vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + prefetcht0 BUFFER1 + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + prefetcht0 64 + BUFFER1 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 > 32 + prefetcht0 128 + BUFFER1 +#endif + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 > 96 + prefetcht0 192 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + +#if B_PR1 > 160 + prefetcht0 256 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + +#if B_PR1 > 224 + prefetcht0 320 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + +#if B_PR1 > 288 + prefetcht0 384 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + +#if B_PR1 > 352 + prefetcht0 448 + BUFFER1 +#endif + leaq (CO1, LDC, 2), %rax + +#if B_PR1 > 416 + prefetcht0 512 + BUFFER1 +#endif + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, 
%ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , 
%xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 
, (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ + + +.macro INIT4x8 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + +.endm + +.macro KERNEL4x8_I + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -4 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 0 * SIZE(BO), %ymm2 + addq $ 16*SIZE, BO +.endm + + +.macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * 
SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + addq $ 8*SIZE, BO +.endm + +.macro KERNEL4x8_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + +.endm + + +.macro SAVE4x8 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 
+ + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL2x8_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 2*SIZE, AO + +.endm + +.macro SAVE2x8 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL1x8_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + 
vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 1*SIZE, AO + +.endm + +.macro SAVE1x8 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + + +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd 
$ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + 
+.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , 
%xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 
, %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) +.endm +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $24, %rdi + divq %rdi // N / 24 + movq %rax, Ndiv12 // N / 24 + movq %rdx, Nmod12 // N % 24 + + + movq Ndiv12, J + cmpq $ 0, J + je .L8_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values from BO1 + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + movq BO2 , B + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm2 + vmovups 0 * SIZE(BO2), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp 
.L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ + + /* recover the original value of pointer B after prefetch */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + + +/**************************************************************************************************/ + +.L13_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values + movq B, BO2 + leaq (B,%rax, SIZE), BO3 // next offset to BO2 + leaq (BO3,%rax, SIZE), B // next offset to B + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + + +.L13_02b: + + vmovups 4 * SIZE(BO2), %ymm1 + vmovups 0 * SIZE(BO3), %ymm2 + vmovups 4 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO2 + addq $ 8*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L13_02b + + + +.L13_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L13_20 + + ALIGN_4 + +.L13_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L13_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + 
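/* Editor's sketch (not part of the patch): the comments around .L12_19 above
   ("here for the prefetch of next b source block", "the increment should be
   proportional to GEMM_Q/GEMM_P") describe a simple idea: while the M loop walks
   the current packed panels, each pass issues a few prefetcht2 touches into the
   next B panel and bumps B by a fixed increment, and once the M loop finishes the
   accumulated increment (I * 64 or I * 128 bytes) is subtracted back so B still
   addresses the current panel. A minimal, simplified C rendering; the names
   sketch_prefetch_next_b and B_INC are hypothetical. */

#include <xmmintrin.h>                    /* _mm_prefetch */

#define B_INC 128                         /* matches the addq $128, B (Linux) case */

static void sketch_prefetch_next_b(const double *b_next, long m_tiles)
{
    const char *b = (const char *)b_next;
    for (long i = 0; i < m_tiles; i++) {
        /* ... the 4x12 micro-kernel for row tile i runs here ... */
        _mm_prefetch(b + 32, _MM_HINT_T2);        /* prefetcht2 32(B) */
        _mm_prefetch(b + 96, _MM_HINT_T2);        /* prefetcht2 96(B) */
        b += B_INC;                               /* addq $128, B     */
    }
    /* the caller rewinds afterwards: b -= m_tiles * B_INC, which is the
       "recover the original value of pointer B" step above. */
}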
+ subq $2, %rax + je .L13_12a + + ALIGN_5 +.L13_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L13_12 + +.L13_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + +.L13_13: + + test $1, %rax + jz .L13_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_14: + + INIT4x12 + + +.L13_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_19 + + ALIGN_4 + +.L13_17: + + KERNEL4x12_SUB + + dec %rax + jne .L13_17 + ALIGN_4 + + +.L13_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L13_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ + /* recover the original value of pointer B */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + +.L13_20: + // Test rest of M + + testq $3, M + jz .L13_100 // to next 16 lines of N + + +.L13_30: + testq $2, M + jz .L13_40 + + ALIGN_4 + +.L13_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L13_36 + ALIGN_4 + +.L13_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L13_32 + ALIGN_4 + +.L13_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_39 + + ALIGN_4 + +.L13_37: + + KERNEL2x12_SUB + + dec %rax + jne .L13_37 + ALIGN_4 + + +.L13_39: + + SAVE2x12 + + ALIGN_4 + +.L13_40: + testq $1, M + jz .L13_100 // to next 3 lines of N + + ALIGN_4 + +.L13_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L13_46 + + ALIGN_4 + +.L13_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L13_42 + ALIGN_4 + +.L13_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_49 + + ALIGN_4 + +.L13_47: + + KERNEL1x12_SUB + + dec %rax + jne .L13_47 + ALIGN_4 + + +.L13_49: + + SAVE1x12 + + ALIGN_4 + +.L13_100: + + decq J // j -- + jg .L12_01 + + + + +/**************************************************************************************************/ + +.L8_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $3, J // j = j / 8 + je .L4_0 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + movq B, BO + addq $12 * 
SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x8 + + movq K, %rax + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x8 + + movq K, %rax + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + + ALIGN_4 + +.L8_100: + + movq K, %rax + salq $3, %rax // * 8 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L8_10 + + + +/**************************************************************************************************/ + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + testq $4, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + 
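/* Editor's sketch (not part of the patch): every 4xN block above (.L12_11,
   .L13_11, .L8_11, .L4_11 and their TRMM twins below) shares one
   software-pipelined loop shape: K is unrolled by 8, the first unrolled group is
   opened with the KERNEL4xN_I variant, the steady state alternates _M1/_M2, the
   last group is drained with _E, and the K % 8 leftovers run through the plain
   _SUB kernel (INIT4xN is used only when K < 8, i.e. when no _I ever ran). The
   control flow, rewritten as C with stand-in function pointers for the
   assembler macros: */

static void sketch_k_loop(long K,
                          void (*k_i)(void),  void (*k_m1)(void),
                          void (*k_m2)(void), void (*k_e)(void),
                          void (*k_sub)(void), void (*init)(void))
{
    long groups = K >> 3;                      /* sarq $3, %rax : K / 8 */
    if (groups >= 2) {
        k_i();                                 /* open the pipeline     */
        for (int j = 0; j < 3; j++) { k_m2(); k_m1(); }
        k_m2();
        for (long g = groups - 2; g > 0; g--)  /* steady state (.Lx_12) */
            for (int j = 0; j < 4; j++) { k_m1(); k_m2(); }
        for (int j = 0; j < 3; j++) { k_m1(); k_m2(); }
        k_m1();
        k_e();                                 /* drain (.Lx_12a)       */
    } else if (groups == 1) {                  /* .Lx_13 odd case       */
        k_i();
        for (int j = 0; j < 3; j++) { k_m2(); k_m1(); }
        k_e();
    } else {
        init();                                /* .Lx_14: INIT4xN       */
    }
    for (long t = K & 7; t > 0; t--)           /* andq $7, %rax : tail  */
        k_sub();
}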
KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq 
$3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups 
%xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $8, %rdi + divq %rdi // N / 8 + movq %rax, Ndiv12 // N / 8 + movq %rdx, Nmod12 // N % 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +/*************************************************************************************************/ +.L8_0: + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 8 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + 
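/* Editor's sketch (not part of the patch): the TRMM variant above repeats one
   bookkeeping pattern for every mr x nr tile size. From the running offset KK it
   derives where to start inside the packed A and B panels and how many
   k-iterations the tile really needs (KKK), depending on LEFT/TRANSA; after the
   SAVE it advances AO/BO past the unused K - KKK steps and, when LEFT is
   defined, grows KK by mr (the "addq $4, KK" just above). A compact C rendering
   of the #if/#elif ladder; trmm_tile_setup is a hypothetical helper name. */

typedef long blaslong_sketch;      /* stand-in for BLASLONG */

/* Returns KKK and moves *ao/*bo to the tile's starting position inside the
   packed panels, mirroring the preprocessor blocks in .L8_11/.L8_31/.L8_41. */
static blaslong_sketch trmm_tile_setup(blaslong_sketch K, blaslong_sketch KK,
                                       int left, int transa,
                                       blaslong_sketch mr, blaslong_sketch nr,
                                       const double **ao, const double **bo)
{
    if (!((left && transa) || (!left && !transa))) {
        *ao += KK * mr;            /* leaq (AO,%rax,mr), AO */
        *bo += KK * nr;            /* leaq (BO,%rax,nr), BO */
    }
    if ((left && !transa) || (!left && transa))
        return K - KK;             /* movq K,%rax; subq KK,%rax  */
    return KK + (left ? mr : nr);  /* movq KK,%rax; addq $mr/$nr */
}

/* After the tile is stored, the matching #if block above advances AO by
   (K - KKK) * mr and BO by (K - KKK) * nr elements, so both panel pointers end
   up just past this tile's data before the next tile starts. */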
+ decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x8 + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x8 + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L8_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK // number 
of values in B +#endif + + + decq J // j -- + jg .L8_10 + + + + + +/*************************************************************************************************/ +.L4_0: + movq Nmod12, J + testq $4, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq 
%rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + 
salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git 
a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index 90a4c2b1d..a5daffb94 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -1,670 +1,670 @@ -#include "common.h" -#include -#include - -//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. - -/* row-major c_block */ -#define INNER_KERNEL_k1m1n8 \ - "prefetcht0 384(%1);"\ - "vmovupd (%1),%%zmm5; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" - -#define INNER_KERNEL_k1m2n8 \ - INNER_KERNEL_k1m1n8\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m1n16 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m2n16 \ - INNER_KERNEL_k1m1n16\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" - -#define INNER_KERNEL_k1m1n24 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" - -#define INNER_KERNEL_k1m2n24 \ - INNER_KERNEL_k1m1n24\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" - -/* row-major z-partition c_block */ -#define INNER_KERNEL_k1m4n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ - "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ - "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" - -#define INNER_KERNEL_k1m4n16 \ - INNER_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ - "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" - -#define INNER_KERNEL_k1m4n24 \ - INNER_KERNEL_k1m4n16\ - "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ - "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" - -#define INNER_KERNEL_k1m8n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ - "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ - "prefetcht0 128(%1);"\ - "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ - "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" - -#define INNER_KERNEL_k1m8n16 \ - INNER_KERNEL_k1m8n8\ - "prefetcht0 128(%1,%%r12,2);"\ - "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ - "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" - -#define 
INNER_KERNEL_k1m8n24 \ - INNER_KERNEL_k1m8n16\ - "prefetcht0 128(%1,%%r12,4);"\ - "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ - "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" - -/* micro kernels */ -#define INNER_KERNELm1(nn) \ - "cmpq $1,%2;jb "#nn"3f;"\ - #nn"4:\n\t"\ - INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ - #nn"3:\n\t" - -#define INNER_KERNELm2(nn) \ - "cmpq $1,%2;jb "#nn"0f;"\ - #nn"1:\n\t"\ - INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ - #nn"0:\n\t" - -#define INNER_KERNELm4(nn) \ - "cmpq $1,%2;jb "#nn"00f;"\ - #nn"01:\n\t"\ - INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ - "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ - #nn"00:\n\t" - -/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ -#define INNER_KERNELm8(nn) \ - "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ - #nn"008:\n\t"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%11); addq $32,%11;"\ - "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ - "movq %3,%10;"\ - #nn"001:\n\t"\ - "cmpq $1,%2;jb "#nn"000f;"\ - "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "decq %2;jmp "#nn"001b;"\ - ""#nn"000:\n\t" - -#define INNER_INIT_m1n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8;" - -#define INNER_INIT_m2n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" - -#define INNER_INIT_m4n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" - -#define INNER_INIT_m8n8 \ - INNER_INIT_m4n8\ - "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" - -#define INNER_INIT_m1n16 INNER_INIT_m2n8 - -#define INNER_INIT_m2n16 INNER_INIT_m4n8 - -#define INNER_INIT_m4n16 INNER_INIT_m8n8 - -#define INNER_INIT_m8n16 \ - INNER_INIT_m8n8\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ - "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" - -#define INNER_INIT_m1n24 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" - -#define INNER_INIT_m2n24 \ - INNER_INIT_m1n24\ - "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" - -#define INNER_INIT_m4n24 \ - INNER_INIT_m4n16\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" - -#define INNER_INIT_m8n24 \ - INNER_INIT_m8n16\ - "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ - "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" - -#define INNER_SETINDEX \ - "vpinsrq 
$0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ - "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" - -#define INNER_STORE_m1n8(c1,disp) \ - "kxnorw %%k1,%%k1,%%k1;"\ - "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ - "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ - "kxnorw %%k1,%%k1,%%k1;"\ - "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" - -#define INNER_SAVE_m1n8 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0) - -#define INNER_SAVE_m1n16 \ - INNER_SAVE_m1n8\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0) - -#define INNER_SAVE_m1n24 \ - INNER_SAVE_m1n16\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0) - -#define INNER_SAVE_m2n8 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm9,8) - -#define INNER_SAVE_m2n16 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm10,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm11,8) - -#define INNER_SAVE_m2n24 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm11,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm12,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0)\ - INNER_STORE_m1n8(%%zmm13,8) - -#define INNER_TRANS_4x8(c1,c2,c3,c4) \ - "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ - "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ - "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ - "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ - -#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ - "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ - "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ - "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ - "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" - -#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) - -//%7 for k01(input) only when m=4 -#define INNER_STORE_4x8(c1,c2,c3,c4) \ - "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ - "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ - "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ - "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ - "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "leaq (%10,%4,4),%10;" - -#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - 
"vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" - -#define INNER_SAVE_m4n8 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ - INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) - -#define INNER_SAVE_m4n16 \ - INNER_SAVE_m4n8\ - INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m4n24 \ - INNER_SAVE_m4n16\ - INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ - INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) - -#define INNER_SAVE_m8n8 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m8n16 \ - INNER_SAVE_m8n8\ - INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ - INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) - -#define INNER_SAVE_m8n24 \ - INNER_SAVE_m8n16\ - INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ - INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) - -#define COMPUTE_n8 {\ - b_pref = packed_b_pointer + 8 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 42222f;"\ - "42221:\n\t"\ - INNER_INIT_m8n8\ - INNER_KERNELm8(8)\ - INNER_SAVE_m8n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ - "42222:\n\t"\ - "cmpq $4,%8; jb 42223f;"\ - INNER_INIT_m4n8\ - INNER_KERNELm4(8)\ - INNER_SAVE_m4n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "42223:\n\t"\ - "cmpq $2,%8; jb 42224f;"\ - INNER_INIT_m2n8\ - INNER_KERNELm2(8)\ - INNER_SAVE_m2n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "42224:\n\t"\ - "cmpq $1,%8; jb 42225f;"\ - INNER_INIT_m1n8\ - INNER_KERNELm1(8)\ - INNER_SAVE_m1n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "42225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n16 {\ - b_pref = packed_b_pointer + 16 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 32222f;"\ - "32221:\n\t"\ - INNER_INIT_m8n16\ - INNER_KERNELm8(16)\ - INNER_SAVE_m8n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ - "32222:\n\t"\ - "cmpq $4,%8; jb 32223f;"\ - 
INNER_INIT_m4n16\ - INNER_KERNELm4(16)\ - INNER_SAVE_m4n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "32223:\n\t"\ - "cmpq $2,%8; jb 32224f;"\ - INNER_INIT_m2n16\ - INNER_KERNELm2(16)\ - INNER_SAVE_m2n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "32224:\n\t"\ - "cmpq $1,%8; jb 32225f;"\ - INNER_INIT_m1n16\ - INNER_KERNELm1(16)\ - INNER_SAVE_m1n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "32225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ - "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n24 {\ - b_pref = packed_b_pointer + 24 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 22222f;"\ - "22221:\n\t"\ - INNER_INIT_m8n24\ - INNER_KERNELm8(24)\ - INNER_SAVE_m8n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ - "22222:\n\t"\ - "cmpq $4,%8; jb 22223f;"\ - INNER_INIT_m4n24\ - INNER_KERNELm4(24)\ - INNER_SAVE_m4n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "22223:\n\t"\ - "cmpq $2,%8; jb 22224f;"\ - INNER_INIT_m2n24\ - INNER_KERNELm2(24)\ - INNER_SAVE_m2n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "22224:\n\t"\ - "cmpq $1,%8; jb 22225f;"\ - INNER_INIT_m1n24\ - INNER_KERNELm1(24)\ - INNER_SAVE_m1n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "22225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ - "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ - "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 -//perform C += A B - if(k==0 || m==0 || ndiv8==0) return; - int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); - int64_t K = (int64_t)k; int64_t M = (int64_t)m; - double *a_block_pointer,*b_pref; - double *c_pointer = c,*c_store = c; - __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; - BLASLONG ndiv8_count; - double *packed_b_pointer = packed_b; - a_block_pointer = packed_a; - for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ - COMPUTE_n24 - } - for(;ndiv8_count>1;ndiv8_count-=2){ - COMPUTE_n16 - } - if(ndiv8_count>0){ - COMPUTE_n8 - } -} - -/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ -/* __m128d accumulators: xc1-xc2; 
temporary variables: xa1,xb1-xb2 */ -/* double accumulator: sc1; temporary variables: sa1,sb1 */ -/* column-major c_block */ -#define KERNEL_m4n4k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ - b_block_pointer+=4;\ -} -#define KERNEL_m4n2k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - b_block_pointer+=2;\ -} -#define KERNEL_m4n1k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - b_block_pointer++;\ -} -#define INIT_m4n1 yc1=_mm256_setzero_pd(); -#define INIT_m4n2 yc2=INIT_m4n1 -#define INIT_m4n4 yc4=yc3=INIT_m4n2 -#define SAVE_m4n1 {\ - yb1 = _mm256_broadcast_sd(alpha);\ - ya1 = _mm256_loadu_pd(c_pointer);\ - yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ - _mm256_storeu_pd(c_pointer,yc1);\ - c_pointer += 4;\ -} -#define SAVE_m4n2 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += 4;\ -} -#define SAVE_m4n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += LDC*2;\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ - c_pointer += 4-LDC*2;\ -} -#define KERNEL_m2n2k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ - b_block_pointer += 2;\ -} -#define KERNEL_m2n1k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - b_block_pointer ++;\ -} -#define INIT_m2n1 xc1=_mm_setzero_pd(); -#define INIT_m2n2 xc2=INIT_m2n1 -#define SAVE_m2n1 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xa1 = _mm_loadu_pd(c_pointer);\ - xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ - _mm_storeu_pd(c_pointer,xc1);\ - c_pointer += 2;\ -} -#define SAVE_m2n2 {\ - xa1 = _mm_loaddup_pd(alpha);\ - xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ - xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ - _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ - c_pointer += 2;\ -} -#define KERNEL_m1n1k1 {\ - sa1 = *a_block_pointer; a_block_pointer++;\ - sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ - b_block_pointer ++;\ -} -#define INIT_m1n1 sc1=0.0; -#define SAVE_m1n1 {\ - *c_pointer += sc1 * (*alpha);\ - c_pointer++;\ -} -/* row-major c_block */ -#define KERNEL_m2n4k1 {\ - yb1 = 
_mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ - a_block_pointer += 2;\ -} -#define KERNEL_m1n4k1 {\ - yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - a_block_pointer ++;\ -} -#define KERNEL_m1n2k1 {\ - xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ - xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - a_block_pointer ++;\ -} -#define INIT_m1n2 INIT_m2n1 -#define INIT_m1n4 INIT_m4n1 -#define INIT_m2n4 INIT_m4n2 -#define SAVE_m2n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - yc2 = _mm256_mul_pd(yc2,ya1);\ - yb1 = _mm256_unpacklo_pd(yc1,yc2);\ - yb2 = _mm256_unpackhi_pd(yc1,yc2);\ - xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ - _mm_storeu_pd(c_pointer,xb1);\ - _mm_storeu_pd(c_pointer+LDC,xb2);\ - xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ - _mm_storeu_pd(c_pointer+2*LDC,xb1);\ - _mm_storeu_pd(c_pointer+3*LDC,xb2);\ - c_pointer += 2;\ -} -#define SAVE_m1n2 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xc1 = _mm_mul_pd(xc1,xb1);\ - *c_pointer += _mm_cvtsd_f64(xc1);\ - xa1 = _mm_unpackhi_pd(xc1,xc1);\ - c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ - c_pointer ++;\ -} -#define SAVE_m1n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - xb1 = _mm256_extractf128_pd(yc1,0);\ - *c_pointer += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ - xb1 = _mm256_extractf128_pd(yc1,1);\ - c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ - c_pointer ++;\ -} -static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 -//perform C += A B , edge_n<8 must be satisfied. 
- if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; - double *a_block_pointer,*b_block_pointer,*b_base_pointer; - double *c_pointer = c; - __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; - __m128d xc1,xc2,xa1,xb1,xb2; - double sc1,sa1,sb1; - BLASLONG m_count,n_count,k_count; - b_base_pointer = packed_b; -//now start calculation of the edge part - for(n_count=edge_n;n_count>3;n_count-=4){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n4 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n4 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n4 - for(k_count=0;k_count1;n_count-=2){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n2 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n2 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n2 - for(k_count=0;k_count0){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n1 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n1 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n1 - for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); - if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); - return 0; -} +#include "common.h" +#include +#include + +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. + +/* row-major c_block */ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +/* row-major z-partition c_block */ +#define INNER_KERNEL_k1m4n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" + +#define INNER_KERNEL_k1m4n24 \ + 
INNER_KERNEL_k1m4n16\ + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" + +#define INNER_KERNEL_k1m8n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ + "prefetcht0 128(%1);"\ + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m8n8\ + "prefetcht0 128(%1,%%r12,2);"\ + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m8n16\ + "prefetcht0 128(%1,%%r12,4);"\ + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" + +/* micro kernels */ +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ + #nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb "#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ +#define INNER_KERNELm8(nn) \ + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%11); addq $32,%11;"\ + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ + "movq %3,%10;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "decq %2;jmp "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" 
+ +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" + +#define INNER_SAVE_m1n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) + +#define INNER_SAVE_m2n24 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) + +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ + +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ + "vblendmpd 
%%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" + +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) + +//%7 for k01(input) only when m=4 +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ + "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ + "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ + "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ + "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "leaq (%10,%4,4),%10;" + +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" + +#define INNER_SAVE_m4n8 \ + "movq %3,%10;"\ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) + +#define INNER_SAVE_m4n16 \ + INNER_SAVE_m4n8\ + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m4n24 \ + INNER_SAVE_m4n16\ + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) + +#define INNER_SAVE_m8n8 \ + "movq %3,%10;"\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m8n16 \ + INNER_SAVE_m8n8\ + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) + +#define INNER_SAVE_m8n24 \ + INNER_SAVE_m8n16\ + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) + +#define COMPUTE_n8 {\ + b_pref = packed_b_pointer + 8 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + 
INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + b_pref = packed_b_pointer + 16 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + b_pref = packed_b_pointer + 24 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + INNER_KERNELm8(24)\ + INNER_SAVE_m8n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ + 
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ + "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer,*b_pref; + double *c_pointer = c,*c_store = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } +} + +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define 
KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1 * (*alpha);\ + c_pointer++;\ +} +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ + yb1 = _mm256_unpacklo_pd(yc1,yc2);\ + yb2 = _mm256_unpackhi_pd(yc1,yc2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ + _mm_storeu_pd(c_pointer,xb1);\ + _mm_storeu_pd(c_pointer+LDC,xb2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ + _mm_storeu_pd(c_pointer+2*LDC,xb1);\ + _mm_storeu_pd(c_pointer+3*LDC,xb2);\ + c_pointer += 2;\ +} +#define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ + *c_pointer += _mm_cvtsd_f64(xc1);\ + xa1 = _mm_unpackhi_pd(xc1,xc1);\ + c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ + c_pointer ++;\ +} +#define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ + xb1 = _mm256_extractf128_pd(yc1,1);\ + c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ + c_pointer ++;\ +} +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied. 
+ if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; + double *a_block_pointer,*b_block_pointer,*b_base_pointer; + double *c_pointer = c; + __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; + __m128d xc1,xc2,xa1,xb1,xb2; + double sc1,sa1,sb1; + BLASLONG m_count,n_count,k_count; + b_base_pointer = packed_b; +//now start calculation of the edge part + for(n_count=edge_n;n_count>3;n_count-=4){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n4 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n4 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n4 + for(k_count=0;k_count1;n_count-=2){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n2 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n2 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n2 + for(k_count=0;k_count0){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n1 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n1 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n1 + for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); + return 0; +} diff --git a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S index 40c5892c6..c353a5913 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S +++ b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S @@ -1,4413 +1,4413 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -/********************************************************************* -* 2013/06/02 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 360 -* DGEMM_Q 160 -* -* Performance at m x n without prefetch of BO: -* -* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) -* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) -* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) -* -* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) -* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/03 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 336 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) -* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) -* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) -* -* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) -* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) -* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/04 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 384 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) -* 6144x5376 86.0 GFLOPS with 4 threads on 4 modules (ACML: 81.5 GFLOPS) -* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) -* -* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) -* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 
128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - 
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 
8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * 
SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), 
%xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 
;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq 
%rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - 
KERNEL8x3_4(xxx) - - je .L6_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - 
vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - 
addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - prefetcht0 
B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - prefetcht0 B_PR1(BO,BI,8) - 
KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * 
SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - 
ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - 
vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - 
KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO 
-#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - 
ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), 
AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - 
KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - 
- je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - 
KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - 
movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/********************************************************************* +* 2013/06/02 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 360 +* DGEMM_Q 160 +* +* Performance at m x n without prefetch of BO: +* +* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) +* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) +* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) +* +* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) +* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/03 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 336 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) +* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) +* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) +* +* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) +* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) +* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/04 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 384 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) +* 6144x5376 86.0 GFLOPS with 4 threads on 4 modules (ACML: 81.5 GFLOPS) +* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) +* +* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) +* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K 
%r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + 
vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 
8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * 
SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * 
SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + 
movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq 
BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + 
KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + 
jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer 
to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq 
BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + 
ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + 
+.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + 
KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # 
coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq 
$2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK 
+#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, 
%rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + 
subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + 
KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq 
%rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index adc00cca3..48eb1bcbe 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -1,4523 +1,4523 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/********************************************************************* -* -* 2013/11/13 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 768 -* DGEMM_Q 168 -* DGEMM_R 12288 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) -* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior -* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior -* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) -* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) -* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) -* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 
32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 - -#else - -#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 - -#endif - - - - -#define A_PR1 512 -#define B_PR1 256 -#define C_PR1 64 - -.macro INIT8x3 - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 -.endm - -.macro KERNEL8x3_INIT - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmulpd %xmm1,%xmm0,%xmm4 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm2,%xmm0,%xmm5 - vmovddup -10 * SIZE(BO), %xmm3 - vmulpd %xmm3,%xmm0,%xmm6 - vmovups -14 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm7 - vmulpd %xmm2,%xmm0,%xmm8 - vmulpd %xmm3,%xmm0,%xmm9 - vmovups -12 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm10 - vmulpd %xmm2,%xmm0,%xmm11 - addq $ 3 * SIZE, BO - vmulpd %xmm3,%xmm0,%xmm12 - vmovups -10 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm13 - vmovddup -12 * SIZE(BO), %xmm1 - vmulpd %xmm2,%xmm0,%xmm14 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm3,%xmm0,%xmm15 -.endm - - -.macro KERNEL8x3_M1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -12 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M2 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -8 * 
SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M3 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup -7 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -6 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -5 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M4 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup -4 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -3 * SIZE(BO), %xmm1 - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -2 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M5 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmovddup -1 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 0 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 1 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M6 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup 2 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 3 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 4 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M7 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup 5 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 6 * SIZE(BO), %xmm1 - VFMADD231PD_( 
%xmm2,%xmm0,%xmm14 ) - vmovddup 7 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M8 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 10 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) - vmovddup 11 * SIZE(BO), %xmm3 - addq $ 32 * SIZE, AO - addq $ 24 * SIZE, BO -.endm - - -.macro KERNEL8x3_E - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 21 * SIZE, BO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_SUBN - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 3 * SIZE, BO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 8 * SIZE, AO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro SAVE8x3 - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - prefetcht0 C_PR1(CO1) - prefetcht0 C_PR1(CO1,LDC) - prefetcht0 C_PR1(CO1,LDC,2) - - addq $ 8 * SIZE, CO1 # coffset += 8 -.endm - - 
-/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - 
vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 
8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - 
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 B_PR1(BO1) - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 
12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L6_13 - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L6_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L6_12 - -.L6_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - -.L6_13: - - test $2, %rax - jz .L6_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - - -.L6_14: - - test $1, %rax - jz .L6_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - - jmp .L6_16 - -.L6_15: - - INIT8x3 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUBN - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE8x3 - - decq I # i -- - jg .L6_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * 
SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - 
KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - - leaq BUFFER2, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L7_13 - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L7_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L7_12 - -.L7_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_13: - - test $2, %rax - jz .L7_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - -.L7_14: - - test $1, %rax - jz .L7_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_15: - - INIT8x3 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUBN - dec %rax - jne .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE8x3 - - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - 
KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl 
.L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of 
values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 
2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - 
KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else 
-/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - 
KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - 
KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - 
- KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - 
-/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - 
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK 
-#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO 
- leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/********************************************************************* +* +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 768 +* DGEMM_Q 168 +* DGEMM_R 12288 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 + +#else + +#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 + +#endif + + + + +#define A_PR1 512 +#define B_PR1 256 +#define C_PR1 64 + +.macro INIT8x3 + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, 
%xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 +.endm + +.macro KERNEL8x3_INIT + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmulpd %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + vmulpd %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm7 + vmulpd %xmm2,%xmm0,%xmm8 + vmulpd %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm10 + vmulpd %xmm2,%xmm0,%xmm11 + addq $ 3 * SIZE, BO + vmulpd %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + vmulpd %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -12 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M2 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -8 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M3 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup -7 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -5 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M4 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup -4 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -3 * SIZE(BO), %xmm1 + addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -2 * 
SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M5 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmovddup -1 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 0 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 1 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M6 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup 2 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 3 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 4 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M7 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup 5 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 7 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M8 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 10 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) + vmovddup 11 * SIZE(BO), %xmm3 + addq $ 32 * SIZE, AO + addq $ 24 * SIZE, BO +.endm + + +.macro KERNEL8x3_E + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + 
addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 21 * SIZE, BO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_SUBN + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + addq $ 3 * SIZE, BO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 8 * SIZE, AO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro SAVE8x3 + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + + addq $ 8 * SIZE, CO1 # coffset += 8 +.endm + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define 
KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 
8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + 
+#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, 
%rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, 
BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if 
!defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 B_PR1(BO1) + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C 
+ leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L6_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L6_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L6_12 + +.L6_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + +.L6_13: + + test $2, %rax + jz .L6_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + + +.L6_14: + + test $1, %rax + jz .L6_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + + jmp .L6_16 + +.L6_15: + + INIT8x3 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUBN + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE8x3 + + decq I # i -- + jg .L6_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + 
vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L7_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + 
KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L7_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L7_12 + +.L7_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_13: + + test $2, %rax + jz .L7_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + +.L7_14: + + test $1, %rax + jz .L7_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_15: + + INIT8x3 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUBN + dec %rax + jne .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE8x3 + + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq 
BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq 
(BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, 
%rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq 
$0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK 
+#endif
+
+	addq $4 * SIZE, CO1 # coffset += 4
+	ALIGN_4
+
+
+.L2_30:
+	testq $2, M
+	jz .L2_40
+
+	ALIGN_4
+
+.L2_31:
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, 8), BO
+	salq $1, %rax // rax = rax * 2 ; number of values
+	leaq (AO, %rax, 8), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $2, %rax // number of values in AO
+#else
+	addq $2, %rax // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $-8, %rax
+	je .L2_36
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $1, %rax // rax = rax *2 ; number of values
+	leaq (AO, %rax, 8), AO
+	leaq (BO, BI, 8), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	je .L2_36
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	je .L2_36
+
+	jmp .L2_32
+	ALIGN_4
+
+.L2_36:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $7, %rax # if (k & 1)
+	je .L2_39
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $1, %rax // rax = rax *2 ; number of values
+	leaq (AO, %rax, 8), AO
+	leaq (BO, BI, 8), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB(xxx)
+	addq $2, BI
+	addq $2, %rax
+	jl .L2_37
+	ALIGN_4
+
+
+.L2_39:
+
+	vmovddup ALPHA, %xmm0
+
+#ifndef TRMMKERNEL
+
+	vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
+	vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
+
+#else
+	vmulpd %xmm0, %xmm4,%xmm4
+	vmulpd %xmm0, %xmm5,%xmm5
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm5 , (CO1, LDC)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, 8), BO
+	salq $1, %rax // rax = rax * 2 ; number of values
+	leaq (AO, %rax, 8), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $2, KK
+#endif
+
+	addq $2 * SIZE, CO1 # coffset += 2
+	ALIGN_4
+
+
+.L2_40:
+	testq $1, M
+	jz .L2_60 // to next 2 lines of N
+
+	ALIGN_4
+
+.L2_41:
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, 8), BO
+	leaq (AO, %rax, 8), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $1, %rax // number of values in
AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 
8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, 
%rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S index 9cc27184d..b31a934f2 100644 --- a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S +++ b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S @@ -1,5231 +1,5231 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. 
Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, 
%rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 
;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq 
$8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 
;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps 
%xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - 
vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - 
movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, 
BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 
- - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - 
movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, 
%rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, 
LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 
# coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps 
(CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - 
KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * 
SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - 
salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - 
KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M 
- - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax 
// rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss %xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq 
%rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - 
salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, 
%xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - 
leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + 
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 
;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + 
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 
;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + 
vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + 
movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 
* SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; 
number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + 
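+// 1x3 inner loop: four-way unrolled KERNEL1x3 sequence, with an exit check (je .L6_46) after every eight steps.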
KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups 
%xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + 
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 
, (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // 
rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + 
ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* 
TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + 
KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: 
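+// 8x2 inner loop: four-way unrolled KERNEL8x2 sequence with a B prefetch and an exit check (je .L2_20_6) after every eight steps.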
+ + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + 
ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss 
%xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + 
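+// Tail handling follows: the remaining M % 16 rows are processed with the 8x1, 4x1, 2x1 and 1x1 kernels for the single remaining column of B.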
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + 
addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S index 7c42f1e12..35b01de07 100644 --- a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S +++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S @@ -1,5258 +1,5258 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/********************************************************************* -* -* 2013/10/18 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/29 Saar -* -* Parameter: -* UNROLL_M 16 -* UNROLL_N 2 -* SGEMM_P 768 -* SGEMM_Q 192 -* SGEMM_R 12288 -* A_PR1 384 -* B_PR1 192 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) -* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) -* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) -* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) -* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) -* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) -* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) -* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) -* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - 
-/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J 
- cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // 
aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - 
KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - 
KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - 
KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups 
%xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , 
(CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - 
-.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, 
SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je 
.L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - 
ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - 
-.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - 
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, 
%rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss %xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - 
KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - 
KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 
- ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - 
jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/29 Saar +* +* Parameter: +* UNROLL_M 16 +* UNROLL_N 2 +* SGEMM_P 768 +* SGEMM_Q 192 +* SGEMM_R 12288 +* A_PR1 384 +* B_PR1 192 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) +* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) +* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) +* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) +* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) +* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) +* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) +* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) +* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
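+
+/* Overview of the kernel macros (informal): each KERNEL<m>x<n>_1 .. _4 macro
+* performs one FMA rank-1 update of an m x n register block for a single k
+* step, and the _4 variant advances BI and %rax past the four k steps covered
+* by one _1.._4 group; KERNEL<m>x<n>_SUB does one k step for the k % 8 tail.
+* For m >= 4 the A column is loaded with vmovups and the B entry broadcast
+* (vbroadcastss + vfmaddps); the 2x and 1x macros use scalar vmovss /
+* vfmaddss. Roughly, in C terms, one k step computes
+*
+*     for (i = 0; i < m; i++)            // illustrative sketch only
+*         for (j = 0; j < n; j++)
+*             acc[i][j] += a[i] * b[j];
+*
+* with acc[][] kept in xmm registers (xmm4 upward) for the whole k loop. */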
+/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J 
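+
+/* Column blocking (informal overview): N is handled in blocks of 6 columns
+* (J = N / 6). For each block, the copy loops below repack B into two
+* 3-column sub-buffers, BUFFER1 and BUFFER2, and the .L6_xx and .L7_xx loops
+* then run the Mx3 kernels against each buffer in turn, advancing C by
+* 3 * LDC each time. The N % 6 remainder is processed afterwards by the
+* 2-column (.L2_xx) and 1-column (.L1_xx) paths. In rough pseudo-C
+* (pack_3_columns_of_B / multiply_Mx3 are illustrative names, not symbols
+* that exist anywhere in the source):
+*
+*     for (j = 0; j < N / 6; j++) {
+*         pack_3_columns_of_B(BUFFER1);   // roughly .L6_01 .. .L6_02b
+*         pack_3_columns_of_B(BUFFER2);   // roughly .L6_02c .. .L6_03b
+*         multiply_Mx3(BUFFER1);          // roughly .L6_10 .. .L6_49
+*         multiply_Mx3(BUFFER2);          // roughly .L7_10 .. .L7_60
+*     }
+*     // then the N % 6 leftover columns, in pairs and finally singly
+*/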
+ cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // 
aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + 
KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + 
KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups 
%xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , 
(CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
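+
+/* Tail-loop indexing (the same pattern is used by the k loops throughout
+* this file): AO and BO have just been advanced past the k % 8 leftover
+* elements, and BI / %rax were negated, so the kernel's (BO,BI,SIZE) and
+* (AO,%rax,SIZE) addressing starts at the beginning of the leftover block and
+* walks forward while the counters climb toward zero; "jl" falls through once
+* the remainder is consumed. Informally: for (i = -r; i < 0; i++) do one
+* KERNEL..._SUB step, with the base pointers already sitting at the end of
+* the r leftover elements. */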
+.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, 
SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je 
.L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + 
+.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 
+ ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + 
jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_sandy.S b/kernel/x86_64/sgemm_kernel_16x4_sandy.S index ea15cd87e..2ee4b1554 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_sandy.S +++ b/kernel/x86_64/sgemm_kernel_16x4_sandy.S @@ -1,3167 +1,3167 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm13, %ymm9 , %ymm9 - vaddps %ymm14, %ymm10, %ymm10 - vaddps %ymm15, %ymm11, %ymm11 - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 
64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm14, %ymm10, %ymm10 - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm8 , %xmm8 - vaddps %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm13, %xmm9 , %xmm9 - vaddss %xmm14, %xmm10, %xmm10 - vaddss %xmm15, %xmm11, %xmm11 - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), 
%xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) 
- -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 
* SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vaddps %ymm12, %ymm4 , %ymm4 - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulps %xmm2 , %xmm0 , %xmm12 - vaddps %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vaddss %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq 
%rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values 
in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - 
leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - 
- KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 
-*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // 
first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of 
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO 
-#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm13, %ymm9 , %ymm9 + vaddps %ymm14, %ymm10, %ymm10 + vaddps %ymm15, %ymm11, %ymm11 + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 
64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm14, %ymm10, %ymm10 + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm8 , %xmm8 + vaddps %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm13, %xmm9 , %xmm9 + vaddss %xmm14, %xmm10, %xmm10 + vaddss %xmm15, %xmm11, %xmm11 + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), 
%xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) 
+ +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 
* SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vaddps %ymm12, %ymm4 , %ymm4 + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulps %xmm2 , %xmm0 , %xmm12 + vaddps %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + vaddss %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq 
%rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax 
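+// plain GEMM build: the tail-iteration count below is taken from the full K; the TRMM build uses the clipped KKK instead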
+#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO 
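+// TRMM build only: advance AO and BO past the untouched remainder of K so both point at the start of the next panel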
+#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values 
in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + 
leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first 
buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + 
+ KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
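+* (handles the single leftover column of C when N is odd)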
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // 
first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO 
+#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c index 4e2cd4fe6..dbfcd55d7 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c @@ -1,279 +1,279 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RN_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1)\ - SOLVE_ri_m8n2(56,6,7,%1)\ - SAVE_SOLUTION_m8n2(6,7,64) - -#define SOLVE_RN_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(10,11,192) - -#define SOLVE_RN_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) 
SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(10,11,192)\ - SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(12,13,256)\ - SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(14,15,320) - -#define SOLVE_RN_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1)\ - SOLVE_ri_m4n2(56,5,%1)\ - SAVE_SOLUTION_m4n2(5,32) - -#define SOLVE_RN_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(7,96) - -#define SOLVE_RN_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) 
SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(7,96)\ - SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(8,128)\ - SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(9,160) - -#define SOLVE_RN_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,0) - -#define SOLVE_RN_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ - SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ - SAVE_SOLUTION_m2n4(6,7,32) - -#define SOLVE_RN_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(6,7,32)\ - SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(8,9,64) - -#define SOLVE_RN_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1)\ - SOLVE_col2_ltor_m1n4(16,4,%1)\ - SOLVE_col3_ltor_m1n4(32,4,%1)\ - SOLVE_col4_ltor_m1n4(48,4,%1)\ - SAVE_SOLUTION_m1n4(4,0) - -#define SOLVE_RN_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(4,0)\ 
- SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(5,16) - -#define SOLVE_RN_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(4,0)\ - SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(5,16)\ - SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(6,32) - -#define GEMM_RN_SIMPLE(mdim,ndim) \ - "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) -#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) -#define GEMM_RN_m8n12 \ - "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) -#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) -#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) -#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) -#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) -#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) -#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) -#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) -#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - __asm__ __volatile__(\ - "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ - "cmpq $8,%%r11; jb "#ndim"772f;"\ - #ndim"771:\n\t"\ - GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ - #ndim"772:\n\t"\ - "testq $4,%%r11; jz "#ndim"773f;"\ - GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ - #ndim"773:\n\t"\ - "testq $2,%%r11; jz "#ndim"774f;"\ - GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ - #ndim"774:\n\t"\ - "testq $1,%%r11; jz "#ndim"775f;"\ - GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ - #ndim"775:\n\t"\ - "movq 
%%r15,%0; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ - :"r11","r12","r13","r14","r15","cc","memory",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ - a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ -} - -static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT a0, b0; - int i, j, k; - for (i=0; i7;m_count-=8){ - if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); - a_ptr += k * 8; c_ptr += 8; - } - for(;m_count>3;m_count-=4){ - if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); - a_ptr += k * 4; c_ptr += 4; - } - for(;m_count>1;m_count-=2){ - if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); - a_ptr += k * 2; c_ptr += 2; - } - if(m_count>0){ - if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); - a_ptr += k * 1; c_ptr += 1; - } -} -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ - float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; - float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; - BLASLONG n_count = n; - for(;n_count>11;n_count-=12) COMPUTE(12) - for(;n_count>7;n_count-=8) COMPUTE(8) - for(;n_count>3;n_count-=4) COMPUTE(4) - for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} - if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); - return 0; -} +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RN_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1)\ + SOLVE_ri_m8n2(56,6,7,%1)\ + SAVE_SOLUTION_m8n2(6,7,64) + +#define SOLVE_RN_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(10,11,192) + +#define SOLVE_RN_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) 
SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(10,11,192)\ + SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(12,13,256)\ + SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(14,15,320) + +#define SOLVE_RN_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1)\ + SOLVE_ri_m4n2(56,5,%1)\ + SAVE_SOLUTION_m4n2(5,32) + +#define SOLVE_RN_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(7,96) + +#define SOLVE_RN_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) 
SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(7,96)\ + SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(8,128)\ + SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(9,160) + +#define SOLVE_RN_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,0) + +#define SOLVE_RN_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(6,7,32) + +#define SOLVE_RN_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(6,7,32)\ + SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(8,9,64) + +#define SOLVE_RN_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1)\ + SOLVE_col2_ltor_m1n4(16,4,%1)\ + SOLVE_col3_ltor_m1n4(32,4,%1)\ + SOLVE_col4_ltor_m1n4(48,4,%1)\ + SAVE_SOLUTION_m1n4(4,0) + +#define SOLVE_RN_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(4,0)\ + 
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(5,16) + +#define SOLVE_RN_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(5,16)\ + SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(6,32) + +#define GEMM_RN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) +#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) +#define GEMM_RN_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) +#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) +#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) +#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) +#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) +#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) +#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) +#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) +#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq 
%%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ +} + +static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0; i7;m_count-=8){ + if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c index ffcbfbbf0..9de3354de 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c @@ -1,281 +1,281 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RT_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-128) - -#define SOLVE_RT_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,6,7,%1) 
SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-256) - -#define SOLVE_RT_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ - SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-176,4,5,%1)\ - SOLVE_le_m8n2(-192,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-384) - -#define SOLVE_RT_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(4,-64) - -#define SOLVE_RT_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(4,-128) - -#define SOLVE_RT_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) 
GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ - SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ - SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-176,4,%1)\ - SOLVE_le_m4n2(-192,4,%1)\ - SAVE_SOLUTION_m4n2(4,-192) - -#define SOLVE_RT_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-32) - -#define SOLVE_RT_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-64) - -#define SOLVE_RT_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) 
SUBTRACT_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-96) - -#define SOLVE_RT_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(4,-16) - -#define SOLVE_RT_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(4,-32) - -#define SOLVE_RT_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-144,4,%1)\ - SOLVE_col3_rtol_m1n4(-160,4,%1)\ - SOLVE_col2_rtol_m1n4(-176,4,%1)\ - SOLVE_col1_rtol_m1n4(-192,4,%1)\ - SAVE_SOLUTION_m1n4(4,-48) - -/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ -#define GEMM_RT_SIMPLE(mdim,ndim) \ - "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) -#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) -#define GEMM_RT_m8n12 \ - "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $8,%5; cmpq $8,%5; 
jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) -#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) -#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) -#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) -#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) -#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) -#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) -#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) -#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ - __asm__ __volatile__(\ - "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ - "cmpq $8,%%r11; jb "#ndim"772f;"\ - #ndim"771:\n\t"\ - GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ - #ndim"772:\n\t"\ - "testq $4,%%r11; jz "#ndim"773f;"\ - GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ - #ndim"773:\n\t"\ - "testq $2,%%r11; jz "#ndim"774f;"\ - GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ - #ndim"774:\n\t"\ - "testq $1,%%r11; jz "#ndim"775f;"\ - GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ - #ndim"775:\n\t"\ - "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ - :"r11","r12","r13","r14","r15","cc","memory",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ - a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ -} - -static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ - FLOAT a0, b0; - int i, j, k; - for (i=n-1;i>=0;i--) { - b0 = b[i*n+i]; - for (j=0;j7;m_count-=8){ - if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); - solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 8; c_ptr += 8; - } - for(;m_count>3;m_count-=4){ - if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); - solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 4; c_ptr += 4; - } - for(;m_count>1;m_count-=2){ - if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); - solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 2; c_ptr += 2; - } - if(m_count>0){ - if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); - solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 1; c_ptr += 1; - } -} -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ - float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; - float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; - BLASLONG n_count = n; - if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} - if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} - for(;n_count>11;n_count-=12) COMPUTE(12) - for(;n_count>7;n_count-=8) COMPUTE(8) - for(;n_count>3;n_count-=4) COMPUTE(4) - return 0; -} +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RT_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; 
leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-128) + +#define SOLVE_RT_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-256) + +#define SOLVE_RT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ + SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-176,4,5,%1)\ + SOLVE_le_m8n2(-192,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-384) + +#define SOLVE_RT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ 
+ SOLVE_rile_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(4,-64) + +#define SOLVE_RT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(4,-128) + +#define SOLVE_RT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ + SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ + SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-176,4,%1)\ + SOLVE_le_m4n2(-192,4,%1)\ + SAVE_SOLUTION_m4n2(4,-192) + +#define SOLVE_RT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-32) + +#define SOLVE_RT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) 
SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-64) + +#define SOLVE_RT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-96) + +#define SOLVE_RT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(4,-16) + +#define SOLVE_RT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(4,-32) + +#define SOLVE_RT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-144,4,%1)\ + SOLVE_col3_rtol_m1n4(-160,4,%1)\ + SOLVE_col2_rtol_m1n4(-176,4,%1)\ + SOLVE_col1_rtol_m1n4(-192,4,%1)\ + SAVE_SOLUTION_m1n4(4,-48) + +/* r14 = b_tail, r15 = a_tail, 
r13 = k-kk */ +#define GEMM_RT_SIMPLE(mdim,ndim) \ + "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) +#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) +#define GEMM_RT_m8n12 \ + "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) +#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) +#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) +#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) +#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) +#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) +#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) +#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) +#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ +} + +static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ + FLOAT a0, b0; + int i, j, k; + for (i=n-1;i>=0;i--) { + b0 = b[i*n+i]; + for (j=0;j7;m_count-=8){ + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(k-kk>0) 
GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; + BLASLONG n_count = n; + if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} + if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + return 0; +} From f92dd6e303da101107246a0274b172659f8db3ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 10:18:36 +0100 Subject: [PATCH 102/154] change line endings from CRLF to LF --- benchmark/Makefile | 6878 ++++++++++++++++++++++---------------------- benchmark/amax.c | 266 +- benchmark/amin.c | 274 +- benchmark/hbmv.c | 268 +- benchmark/hpmv.c | 266 +- benchmark/iamin.c | 240 +- benchmark/imax.c | 228 +- benchmark/imin.c | 228 +- benchmark/max.c | 226 +- benchmark/min.c | 226 +- benchmark/rotm.c | 276 +- benchmark/spmv.c | 292 +- 12 files changed, 4834 insertions(+), 4834 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a..d9ddb9042 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1,3439 +1,3439 @@ -TOPDIR = .. -include $(TOPDIR)/Makefile.system - -# ACML standard -#ACML=/opt/acml5.3.1/gfortran64_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML custom -#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML 6.1 custom -ACML=/home/saar/acml6.1/gfortran64_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - - -# Atlas Ubuntu -#ATLAS=/usr/lib/atlas-base -#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Atlas RHEL and Fedora -ATLAS=/usr/lib64/atlas -LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Intel standard -# MKL=/opt/intel/mkl/lib/intel64 -# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Intel custom -MKL=/home/saar/intel_mkl -LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Apple vecLib -LIBVECLIB = -framework Accelerate - -ESSL=/opt/ibm/lib -#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a -LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a - -ifneq ($(NO_LAPACK), 1) -GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - csymv.goto zsymv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto -else -GOTO_LAPACK_TARGETS= -endif - -ifeq 
($(BUILD_BFLOAT16),1) -GOTO_HALF_TARGETS=sbgemm.goto -else -GOTO_HALF_TARGETS= -endif - -ifeq ($(OSNAME), WINNT) - -goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto csymv.goto zsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas 
dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl \ - srot.mkl drot.mkl csrot.mkl zdrot.mkl \ - srotm.mkl drotm.mkl \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - -else - -goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto 
csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - smallscaling \ - isamax.goto idamax.goto icamax.goto izamax.goto \ - ismax.goto idmax.goto \ - isamin.goto idamin.goto icamin.goto izamin.goto \ - ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ - smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ - smin.goto dmin.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - 
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - - - - -endif - -essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ - cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ - slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ - scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ - strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl - -veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ - scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ - sgemm.veclib 
dgemm.veclib cgemm.veclib zgemm.veclib \ - strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ - strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ - sspr.veclib dspr.veclib \ - sspr2.veclib dspr2.veclib \ - ssyr.veclib dsyr.veclib \ - ssyr2.veclib dsyr2.veclib \ - ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ - ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ - sger.veclib dger.veclib cger.veclib zger.veclib \ - sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ - srot.veclib drot.veclib csrot.veclib zdrot.veclib \ - srotm.veclib drotm.veclib \ - saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ - scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ - sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ - sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ - sasum.veclib dasum.veclib casum.veclib zasum.veclib \ - ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ - chemv.veclib zhemv.veclib \ - chbmv.veclib zhbmv.veclib \ - chpmv.veclib zhpmv.veclib \ - chemm.veclib zhemm.veclib \ - cherk.veclib zherk.veclib \ - cher2k.veclib zher2k.veclib \ - cher.veclib zher.veclib \ - cher2.veclib zher2.veclib \ - sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ - stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ - stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ - strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ - sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ - sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ - sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ - spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ - saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib - -goto_3m :: cgemm3m.goto zgemm3m.goto - -mkl_3m :: cgemm3m.mkl zgemm3m.mkl - -all :: goto mkl atlas acml veclib - -exe : - @./Make_exe.sh - -##################################### Slinpack #################################################### -slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -slinpack.acml : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.atlas : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.mkl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.veclib : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.essl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dlinpack #################################################### -dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dlinpack.acml : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.atlas : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.mkl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.veclib : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.essl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-##################################### Clinpack #################################################### - -clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -clinpack.acml : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.atlas : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.mkl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.veclib : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.essl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zlinpack #################################################### - -zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zlinpack.acml : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.atlas : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.mkl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.veclib : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.essl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scholesky ################################################### - -scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scholesky.acml : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.atlas : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.mkl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.veclib : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.essl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcholesky ################################################### - -dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcholesky.acml : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.atlas : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.mkl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.veclib : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.essl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccholesky ################################################### - -ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccholesky.acml : 
ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.atlas : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.mkl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.veclib : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.essl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Zcholesky ################################################### - -zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcholesky.acml : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.atlas : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.mkl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.veclib : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.essl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemm #################################################### -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -endif - -sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemm.acml : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.atlas : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.mkl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.veclib : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.essl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemm #################################################### -dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemm.acml : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.atlas : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.mkl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.veclib : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.essl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemm #################################################### - -cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm.acml : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.atlas : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -cgemm.mkl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.veclib : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.essl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm #################################################### - -zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm.acml : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.atlas : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.mkl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.veclib : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.essl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymm #################################################### -ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymm.acml : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.atlas : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.mkl : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.veclib : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymm #################################################### -dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymm.acml : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.atlas : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.mkl : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.veclib : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymm #################################################### - -csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymm.acml : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.atlas : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.mkl : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.veclib : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsymm #################################################### - -zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymm.acml : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.atlas : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.mkl : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.veclib : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmm #################################################### -strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmm.acml : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.atlas : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.mkl : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.veclib : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.essl : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmm #################################################### -dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmm.acml : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.atlas : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.mkl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.veclib : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.essl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmm #################################################### - -ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmm.acml : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.atlas : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.mkl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.veclib : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.essl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmm #################################################### - -ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmm.acml : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.atlas : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.mkl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.veclib : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.essl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsm #################################################### -strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) - $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsm.acml : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.atlas : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.mkl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.veclib : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.essl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsm #################################################### -dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsm.acml : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.atlas : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.mkl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.veclib : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.essl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsm #################################################### - -ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsm.acml : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.atlas : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.mkl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.veclib : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.essl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsm #################################################### - -ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsm.acml : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.atlas : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.mkl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.veclib : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.essl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ssyr #################################################### -ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr.acml : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.atlas : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.mkl : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.veclib : ssyr.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr #################################################### -dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr.acml : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.atlas : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.mkl : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.veclib : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr #################################################### -sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr.acml : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.atlas : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.mkl : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.veclib : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr #################################################### -dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr.acml : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.atlas : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.mkl : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.veclib : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr2 #################################################### -sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr2.acml : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.atlas : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.mkl : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.veclib : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr2 #################################################### -dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr2.acml : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.atlas : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.mkl : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.veclib : dspr2.$(SUFFIX) - -##################################### Ssyr2 #################################################### -ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2.acml : ssyr2.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.atlas : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.mkl : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.veclib : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr2 #################################################### -dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2.acml : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.atlas : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.mkl : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.veclib : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyrk #################################################### -ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyrk.acml : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.atlas : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.mkl : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.veclib : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyrk #################################################### -dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyrk.acml : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.atlas : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.mkl : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.veclib : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyrk #################################################### - -csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyrk.acml : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.atlas : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.mkl : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.veclib : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyrk #################################################### - -zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyrk.acml : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.atlas : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.mkl : 
zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.veclib : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyr2k #################################################### -ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2k.acml : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.atlas : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.mkl : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.veclib : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyr2k #################################################### -dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2k.acml : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.atlas : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.mkl : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.veclib : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyr2k #################################################### - -csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyr2k.acml : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.atlas : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.mkl : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.veclib : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyr2k #################################################### - -zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyr2k.acml : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.atlas : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.mkl : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.veclib : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemm #################################################### - -chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemm.acml : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.atlas : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.mkl : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.veclib : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemm #################################################### - -zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemm.acml : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.atlas : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.mkl : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.veclib : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cherk #################################################### - -cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cherk.acml : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.atlas : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.mkl : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.veclib : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zherk #################################################### - -zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zherk.acml : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.atlas : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.mkl : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.veclib : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2k #################################################### - -cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2k.acml : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.atlas : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.mkl : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.veclib : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2k #################################################### - -zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2k.acml : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.atlas : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.mkl : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.veclib : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher #################################################### - -cher.goto : cher.$(SUFFIX) 
../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher.acml : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.atlas : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.mkl : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.veclib : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher #################################################### - -zher.goto : zher.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher.acml : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.atlas : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.mkl : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.veclib : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2 #################################################### - -cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2.acml : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.atlas : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.mkl : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.veclib : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2 #################################################### - -zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2.acml : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.atlas : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.mkl : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.veclib : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemv.acml : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.atlas : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.mkl : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.veclib : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemv.acml : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.atlas : 
dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.mkl : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.veclib : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemv #################################################### - -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemv.acml : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.atlas : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.mkl : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.veclib : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemv.acml : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.atlas : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.mkl : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.veclib : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspmv #################################################### -sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspmv.atlas : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspmv #################################################### -dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspmv.atlas : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmv #################################################### -strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmv.acml : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.atlas : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.mkl : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.veclib : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmv #################################################### -dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmv.acml : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.atlas : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.mkl : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-dtrmv.veclib : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmv #################################################### - -ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmv.acml : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.atlas : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.mkl : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.veclib : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmv #################################################### - -ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmv.acml : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.atlas : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.mkl : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.veclib : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Stpmv #################################################### -stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpmv.acml : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.atlas : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.mkl : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.veclib : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtpmv #################################################### -dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpmv.acml : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.atlas : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.mkl : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.veclib : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpmv #################################################### - -ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpmv.acml : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.atlas : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.mkl : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.veclib : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpmv 
#################################################### - -ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpmv.acml : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.atlas : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.mkl : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.veclib : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Stpsv #################################################### -stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpsv.acml : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.atlas : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.mkl : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.veclib : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtpsv #################################################### -dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpsv.acml : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.atlas : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.mkl : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.veclib : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpsv #################################################### - -ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpsv.acml : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.atlas : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.mkl : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.veclib : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpsv #################################################### - -ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpsv.acml : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.atlas : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.mkl : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.veclib : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsv #################################################### -strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsv.acml : strsv.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.atlas : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.mkl : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.veclib : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsv #################################################### -dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsv.acml : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.atlas : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.mkl : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.veclib : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsv #################################################### - -ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsv.acml : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.atlas : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.mkl : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.veclib : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsv #################################################### - -ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsv.acml : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.atlas : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.mkl : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.veclib : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sger #################################################### -sger.goto : sger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sger.acml : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.atlas : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.mkl : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.veclib : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dger #################################################### -dger.goto : dger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dger.acml : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.atlas : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.mkl : dger.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.veclib : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cger #################################################### -cger.goto : cger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cger.acml : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.atlas : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.mkl : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.veclib : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zger #################################################### -zger.goto : zger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zger.acml : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.atlas : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.mkl : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.veclib : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymv #################################################### -ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymv.acml : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.atlas : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.mkl : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.veclib : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymv.acml : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.atlas : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.mkl : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.veclib : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymv #################################################### -csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymv.acml : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.atlas : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.mkl : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.veclib : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### 
Dsymv #################################################### -zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymv.acml : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.atlas : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.mkl : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.veclib : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgeev #################################################### -sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgeev.acml : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.atlas : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.mkl : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.veclib : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgeev #################################################### -dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgeev.acml : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.atlas : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.mkl : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.veclib : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgeev #################################################### - -cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgeev.acml : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.atlas : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.mkl : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.veclib : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgeev #################################################### - -zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgeev.acml : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.atlas : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.mkl : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.veclib : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgetri #################################################### -sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgetri.acml : 
sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.atlas : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.mkl : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.veclib : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgetri #################################################### -dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgetri.acml : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.atlas : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.mkl : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.veclib : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgetri #################################################### - -cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgetri.acml : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.atlas : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.mkl : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.veclib : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgetri #################################################### - -zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgetri.acml : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.atlas : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.mkl : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.veclib : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Spotrf #################################################### -spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -spotrf.acml : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.atlas : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.mkl : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.veclib : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dpotrf #################################################### -dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dpotrf.acml : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.atlas : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.mkl : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.veclib : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cpotrf #################################################### - -cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cpotrf.acml : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.atlas : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.mkl : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.veclib : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zpotrf #################################################### - -zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zpotrf.acml : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.atlas : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.mkl : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.veclib : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemv #################################################### - -chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemv.acml : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.atlas : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.mkl : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.veclib : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemv #################################################### - -zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemv.acml : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.atlas : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.mkl : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.veclib : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chbmv #################################################### - -chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chbmv.acml : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.atlas : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.mkl : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-chbmv.veclib : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhbmv #################################################### - -zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhbmv.acml : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.atlas : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.mkl : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.veclib : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chpmv #################################################### - -chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chpmv.acml : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.atlas : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.mkl : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.veclib : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhpmv #################################################### - -zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhpmv.acml : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.atlas : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.mkl : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.veclib : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sdot.acml : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.atlas : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.mkl : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.veclib : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ddot.acml : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.atlas : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.mkl : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.veclib : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) 
../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cdot.acml : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.atlas : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.mkl : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.veclib : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdot.acml : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.atlas : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.mkl : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.veclib : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srot.acml : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.atlas : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.mkl : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.veclib : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drot.acml : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.atlas : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.mkl : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.veclib : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### csrot #################################################### -csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csrot.acml : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.atlas : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.mkl : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.veclib : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### zdrot #################################################### -zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdrot.acml : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.atlas : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.mkl : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.veclib : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### srotm #################################################### -srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srotm.acml : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.atlas : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.mkl : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.veclib : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### drotm #################################################### -drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drotm.acml : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.atlas : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.mkl : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.veclib : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpy.acml : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.atlas : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.mkl : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.veclib : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpy.acml : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.atlas : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.mkl : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.veclib : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpy.acml : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.atlas : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.mkl : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.veclib : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) 
-o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpy.acml : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.atlas : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.mkl : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.veclib : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpby #################################################### -saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpby.acml : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.atlas : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.mkl : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.veclib : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpby #################################################### -daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpby.acml : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.atlas : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.mkl : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.veclib : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpby #################################################### - -caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpby.acml : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.atlas : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.mkl : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.veclib : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpby #################################################### - -zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpby.acml : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.atlas : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.mkl : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.veclib : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scopy #################################################### 
-scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scopy.acml : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.atlas : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.mkl : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.veclib : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcopy.acml : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.atlas : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.mkl : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.veclib : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccopy.acml : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.atlas : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.mkl : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.veclib : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcopy.acml : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.atlas : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.mkl : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.veclib : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sscal.acml : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.atlas : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.mkl : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.veclib : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dscal.acml : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -dscal.atlas : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.mkl : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.veclib : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cscal.acml : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.atlas : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.mkl : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.veclib : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zscal.acml : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.atlas : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.mkl : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.veclib : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sasum.acml : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.atlas : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.mkl : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.veclib : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dasum.acml : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.atlas : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.mkl : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.veclib : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -casum.acml : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.atlas : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.mkl : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.veclib : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zasum.acml : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.atlas : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.mkl : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.veclib : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sswap.acml : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.atlas : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.mkl : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.veclib : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dswap.acml : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.atlas : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.mkl : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.veclib : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cswap.acml : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.atlas : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.mkl : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.veclib : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zswap.acml : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.atlas : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.mkl : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.veclib : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### 
Sgesv #################################################### -sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgesv.acml : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.atlas : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.mkl : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.veclib : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgesv #################################################### -dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgesv.acml : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.atlas : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.mkl : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.veclib : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgesv #################################################### - -cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgesv.acml : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.atlas : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.mkl : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.veclib : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgesv #################################################### - -zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgesv.acml : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.atlas : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.mkl : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.veclib : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Cgemm3m #################################################### - -cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm3m.mkl : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm3m.veclib : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm3m #################################################### - -zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm3m.mkl : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm3m.veclib : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -isamax.atlas : isamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -idamax.atlas : idamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ICAMAX ############################################## -icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -icamax.atlas : icamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -izamax.atlas : izamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - 
-############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX ############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SNRM2 ############################################## -snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -snrm2.atlas : snrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## DNRM2 ############################################## -dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dnrm2.atlas : dnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Sscnrm2 ############################################## -scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scnrm2.atlas : scnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Ddznrm2 ############################################## -dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dznrm2.atlas : dznrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - 
-################################################################################################### - -slinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -clinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ -endif - -sgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o 
$(@F) $^ - -zhemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -strmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ssymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgetri.$(SUFFIX) : getri.c 
- $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -spotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -sgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - 
-dgesv.$(SUFFIX) : gesv.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-cgesv.$(SUFFIX) : gesv.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zgesv.$(SUFFIX) : gesv.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-srot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-drot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-csrot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zdrot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-srotm.$(SUFFIX) : rotm.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-drotm.$(SUFFIX) : rotm.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-
-cgemm3m.$(SUFFIX) : gemm3m.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zgemm3m.$(SUFFIX) : gemm3m.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-isamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-icamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-izamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-ismax.$(SUFFIX) : imax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idmax.$(SUFFIX) : imax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-isamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-icamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-izamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-ismin.$(SUFFIX) : imin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idmin.$(SUFFIX) : imin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-samax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-damax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-camax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zamax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-smax.$(SUFFIX) : max.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-dmax.$(SUFFIX) : max.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-samin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-damin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-camin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zamin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-smin.$(SUFFIX) : min.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-dmin.$(SUFFIX) : min.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-snrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-dnrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-scnrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-dznrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-smallscaling: smallscaling.c ../$(LIBNAME)
-	$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
-
-clean ::
-	@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
-
-include $(TOPDIR)/Makefile.tail
+TOPDIR = ..
+include $(TOPDIR)/Makefile.system + +# ACML standard +#ACML=/opt/acml5.3.1/gfortran64_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML custom +#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML 6.1 custom +ACML=/home/saar/acml6.1/gfortran64_mp/lib +LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + + +# Atlas Ubuntu +#ATLAS=/usr/lib/atlas-base +#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Atlas RHEL and Fedora +ATLAS=/usr/lib64/atlas +LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Intel standard +# MKL=/opt/intel/mkl/lib/intel64 +# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Intel custom +MKL=/home/saar/intel_mkl +LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Apple vecLib +LIBVECLIB = -framework Accelerate + +ESSL=/opt/ibm/lib +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a + +ifneq ($(NO_LAPACK), 1) +GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + csymv.goto zsymv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto +else +GOTO_LAPACK_TARGETS= +endif + +ifeq ($(BUILD_BFLOAT16),1) +GOTO_HALF_TARGETS=sbgemm.goto +else +GOTO_HALF_TARGETS= +endif + +ifeq ($(OSNAME), WINNT) + +goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto 
\ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas 
zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + +else + +goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + smallscaling \ + isamax.goto idamax.goto icamax.goto izamax.goto \ + ismax.goto idmax.goto \ + isamin.goto idamin.goto icamin.goto izamin.goto \ + ismin.goto idmin.goto \ + samax.goto damax.goto camax.goto zamax.goto \ + smax.goto dmax.goto \ + samin.goto damin.goto camin.goto zamin.goto \ + smin.goto dmin.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml 
ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl 
\ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + + + + +endif + +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ + strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl + +veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ + scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ + sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ + strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ + strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ + ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ + ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ + ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ + sger.veclib dger.veclib cger.veclib zger.veclib \ + sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ + srotm.veclib drotm.veclib \ + saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ + sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ + sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ + sasum.veclib dasum.veclib casum.veclib zasum.veclib \ + ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ + chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ + chemm.veclib zhemm.veclib \ + cherk.veclib zherk.veclib \ + cher2k.veclib zher2k.veclib \ + cher.veclib zher.veclib \ + cher2.veclib zher2.veclib \ + sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ + stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ + sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ + sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ + spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.veclib daxpby.veclib caxpby.veclib 
zaxpby.veclib + +goto_3m :: cgemm3m.goto zgemm3m.goto + +mkl_3m :: cgemm3m.mkl zgemm3m.mkl + +all :: goto mkl atlas acml veclib + +exe : + @./Make_exe.sh + +##################################### Slinpack #################################################### +slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +slinpack.acml : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.atlas : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.mkl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.veclib : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dlinpack #################################################### +dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dlinpack.acml : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.atlas : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.mkl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.veclib : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Clinpack #################################################### + +clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +clinpack.acml : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.atlas : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.mkl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.veclib : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zlinpack #################################################### + +zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zlinpack.acml : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.atlas : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.mkl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.veclib : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scholesky ################################################### + +scholesky.goto 
: scholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scholesky.acml : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.atlas : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.mkl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.veclib : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.essl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcholesky ################################################### + +dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcholesky.acml : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.atlas : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.mkl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.veclib : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.essl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccholesky ################################################### + +ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccholesky.acml : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.atlas : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.mkl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.veclib : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.essl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Zcholesky ################################################### + +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcholesky.acml : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.atlas : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.mkl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.veclib : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.essl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemm #################################################### +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + +sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm + +sgemm.acml : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.atlas : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.mkl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.veclib : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemm #################################################### +dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemm.acml : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.atlas : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.mkl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.veclib : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemm #################################################### + +cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm.acml : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.atlas : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.mkl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.veclib : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm #################################################### + +zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm.acml : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.atlas : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.mkl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.veclib : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymm #################################################### +ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymm.acml : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.atlas : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.mkl : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.veclib : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymm #################################################### +dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymm.acml : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.atlas : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.mkl : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.veclib : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymm #################################################### + +csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymm.acml : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.atlas : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.mkl : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.veclib : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymm #################################################### + +zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymm.acml : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.atlas : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.mkl : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.veclib : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmm #################################################### +strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmm.acml : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.atlas : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.mkl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.veclib : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmm #################################################### +dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmm.acml : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.atlas : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.mkl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.veclib : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmm #################################################### + +ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmm.acml : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.atlas : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.mkl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.veclib : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmm #################################################### + +ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmm.acml : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.atlas : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.mkl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.veclib : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsm #################################################### +strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsm.acml : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.atlas : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.mkl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.veclib : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.essl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsm #################################################### +dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsm.acml : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.atlas : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.mkl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.veclib : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.essl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsm #################################################### + +ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsm.acml : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.atlas : 
ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.mkl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.veclib : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.essl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsm #################################################### + +ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsm.acml : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.atlas : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.mkl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.veclib : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.essl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyrk #################################################### +ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyrk.acml : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.atlas : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.mkl : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.veclib : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+##################################### Dsyrk #################################################### +dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyrk.acml : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.atlas : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.mkl : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.veclib : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyrk #################################################### + +csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyrk.acml : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.atlas : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.mkl : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.veclib : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyrk #################################################### + +zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyrk.acml : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.atlas : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.mkl : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.veclib : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyr2k #################################################### +ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2k.acml : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.atlas : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.mkl : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.veclib : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsyr2k #################################################### +dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2k.acml : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.atlas : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.mkl : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.veclib : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyr2k #################################################### + +csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyr2k.acml : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.atlas : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.mkl : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.veclib : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyr2k #################################################### + +zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyr2k.acml : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.atlas : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.mkl : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.veclib : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemm #################################################### + +chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemm.acml : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.atlas : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.mkl : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.veclib : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemm #################################################### + +zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemm.acml : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.atlas : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.mkl : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.veclib : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cherk #################################################### + +cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cherk.acml : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.atlas : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.mkl : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.veclib : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zherk #################################################### + +zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zherk.acml : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.atlas : 
zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.mkl : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.veclib : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2k #################################################### + +cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2k.acml : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.atlas : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.mkl : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.veclib : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2k #################################################### + +zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2k.acml : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.atlas : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.mkl : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.veclib : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher #################################################### + +cher.goto : cher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher.acml : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.atlas : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.mkl : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.veclib : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher #################################################### + +zher.goto : zher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher.acml : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.atlas : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.mkl : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.veclib : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2 #################################################### + +cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2.acml : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.atlas : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.mkl : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +cher2.veclib : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2 #################################################### + +zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2.acml : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.atlas : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.mkl : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.veclib : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemv.acml : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.atlas : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.mkl : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemv.acml : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.atlas : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.mkl : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemv.acml : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.atlas : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.mkl : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemv.acml : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.atlas : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.mkl : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspmv 
#################################################### +sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspmv.atlas : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspmv #################################################### +dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspmv.atlas : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Stpmv #################################################### +stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpmv.acml : stpmv.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.atlas : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.mkl : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.veclib : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpmv #################################################### +dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpmv.acml : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.atlas : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.mkl : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.veclib : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpmv #################################################### + +ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpmv.acml : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.atlas : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.mkl : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.veclib : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpmv #################################################### + +ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpmv.acml : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.atlas : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.mkl : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.veclib : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpsv #################################################### +stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpsv.acml : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.atlas : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.mkl : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.veclib : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpsv #################################################### +dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpsv.acml : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.atlas : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.mkl : 
dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.veclib : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpsv #################################################### + +ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpsv.acml : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.atlas : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.mkl : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.veclib : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpsv #################################################### + +ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpsv.acml : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.atlas : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.mkl : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.veclib : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sger #################################################### +sger.goto : sger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sger.acml : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.atlas : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.mkl : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.veclib : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dger #################################################### +dger.goto : dger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dger.acml : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.atlas : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.mkl : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.veclib : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cger #################################################### +cger.goto : cger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cger.acml : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.atlas : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.mkl : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.veclib : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zger #################################################### +zger.goto : zger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zger.acml : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.atlas : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.mkl : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.veclib : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymv #################################################### +ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymv.acml : 
ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.atlas : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.mkl : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.veclib : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv #################################################### +dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymv.acml : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.atlas : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.mkl : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.veclib : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymv #################################################### +csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymv.acml : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.atlas : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.mkl : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.veclib : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv #################################################### +zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymv.acml : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.atlas : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.mkl : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.veclib : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgeev #################################################### +sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgeev.acml : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.atlas : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.mkl : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.veclib : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgeev #################################################### +dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgeev.acml : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.atlas : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+dgeev.mkl : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.veclib : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgeev #################################################### + +cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgeev.acml : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.atlas : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.mkl : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.veclib : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgeev #################################################### + +zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgeev.acml : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.atlas : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.mkl : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.veclib : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgetri #################################################### +sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgetri.acml : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.atlas : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.mkl : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.veclib : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgetri #################################################### +dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgetri.acml : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.atlas : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.mkl : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.veclib : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgetri #################################################### + +cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgetri.acml : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.atlas : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.mkl : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.veclib : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgetri #################################################### + +zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgetri.acml : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.atlas : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.mkl : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.veclib : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Spotrf #################################################### +spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +spotrf.acml : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.atlas : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.mkl : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.veclib : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dpotrf #################################################### +dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dpotrf.acml : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.atlas : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.mkl : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.veclib : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cpotrf #################################################### + +cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cpotrf.acml : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.atlas : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.mkl : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.veclib : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zpotrf #################################################### + +zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zpotrf.acml : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.atlas : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.mkl : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.veclib : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemv #################################################### + 
+chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemv.acml : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.atlas : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.mkl : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.veclib : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemv #################################################### + +zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemv.acml : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.atlas : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.mkl : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.veclib : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### + +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sdot.acml : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.atlas : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.mkl : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.veclib : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ddot.acml : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.atlas : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.mkl : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.veclib : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cdot.acml : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.atlas : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.mkl : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.veclib : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdot.acml : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.atlas : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.mkl : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.veclib : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib 
: srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot #################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + 
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpy.acml : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.atlas : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.mkl : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.veclib : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpy.acml : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.atlas : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.mkl : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.veclib : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpy.acml : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.atlas : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.mkl : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.veclib : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpy.acml : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.atlas : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.mkl : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.veclib : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### + +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scopy.acml : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.atlas : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.mkl : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.veclib : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcopy.acml : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.atlas : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.mkl : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.veclib : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccopy.acml : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.atlas : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.mkl : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.veclib : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcopy.acml : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.atlas : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.mkl : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.veclib : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sscal.acml : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.atlas : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.mkl : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.veclib : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dscal.acml : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.atlas : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.mkl : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.veclib : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cscal.acml : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.atlas : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.mkl : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.veclib : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zscal.acml : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.atlas : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.mkl : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.veclib : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### 
Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sasum.acml : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.atlas : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.mkl : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.veclib : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dasum.acml : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.atlas : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.mkl : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.veclib : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +casum.acml : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.atlas : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.mkl : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.veclib : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zasum.acml : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.atlas : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.mkl : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.veclib : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sswap.acml : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.atlas : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.mkl : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.veclib : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dswap.acml : dswap.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.atlas : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.mkl : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.veclib : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cswap.acml : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.atlas : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.mkl : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.veclib : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zswap.acml : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.atlas : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.mkl : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.veclib : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Sgesv #################################################### +sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgesv.acml : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.atlas : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.mkl : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.veclib : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgesv #################################################### +dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgesv.acml : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.atlas : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.mkl : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.veclib : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgesv #################################################### + +cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgesv.acml : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.atlas : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.mkl : 
cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.veclib : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgesv #################################################### + +zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgesv.acml : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.atlas : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.mkl : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.veclib : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Cgemm3m #################################################### + +cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm3m.mkl : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm3m.veclib : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm3m #################################################### + +zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm3m.mkl : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm3m.veclib : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +isamax.atlas : isamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +idamax.atlas : idamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +icamax.atlas : icamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +izamax.atlas : izamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMAX ############################################## +camax.goto : camax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMAX ############################################## +zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX ############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + 
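All of the linking rules in this hunk follow one shape: every <name>.goto target links the benchmark driver against the freshly built ../$(LIBNAME) (plus -lm), while the optional .acml/.atlas/.mkl/.veclib targets link the same object against an external BLAS, and the leading '-' on those recipes lets make keep going when that reference library is not installed. Purely as a minimal sketch -- not part of the patch, and assuming the LIBATLAS/CEXTRALIB/EXTRALIB/FEXTRALIB variables already defined earlier in this Makefile -- the same pattern could be generated once with a GNU make template:

# Illustrative only; the hand-written per-target rules above are what the patch actually adds.
define link-bench
$(1).goto : $(1).$$(SUFFIX) ../$$(LIBNAME)
	$$(CC) $$(CFLAGS) -o $$(@F) $$^ $$(CEXTRALIB) $$(EXTRALIB) $$(FEXTRALIB) -lm

$(1).atlas : $(1).$$(SUFFIX)
	-$$(CC) $$(CFLAGS) -o $$(@F) $$^ $$(LIBATLAS) $$(CEXTRALIB) $$(EXTRALIB) $$(FEXTRALIB)
endef

# e.g. for the index-of-max benchmarks defined just above:
$(foreach b,isamax idamax icamax izamax,$(eval $(call link-bench,$(b))))

The fully spelled-out rules used here are more verbose, but they keep every target individually greppable and require no GNU-make-specific metaprogramming.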
+############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SNRM2 ############################################## +snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +snrm2.atlas : snrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DNRM2 ############################################## +dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dnrm2.atlas : dnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Sscnrm2 ############################################## +scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scnrm2.atlas : scnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Ddznrm2 ############################################## +dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dznrm2.atlas : dznrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +################################################################################################### + +slinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +clinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + +sgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + 
+ztpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ssymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +spotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + 
+zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +sgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + + +cgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o 
$(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +snrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +scnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +dznrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smallscaling: smallscaling.c ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread + +clean :: + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling + +include $(TOPDIR)/Makefile.tail diff --git a/benchmark/amax.c b/benchmark/amax.c index 29310dd71..446ba4c07 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMAX - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMAX BLASFUNC(dzamax) -#else -#define AMAX BLASFUNC(scamax) -#endif -#else -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - AMAX(&m, x, &inc_x); - end(); - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMAX BLASFUNC(dzamax) +#else +#define AMAX BLASFUNC(scamax) +#endif +#else +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + AMAX(&m, x, &inc_x); + end(); + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/amin.c b/benchmark/amin.c index 54a1d266a..44f15a7f8 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -1,137 +1,137 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) -#else -#define AMIN BLASFUNC(scamin) -#endif -#else -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - - AMIN(&m, x, &inc_x); - - end(); - - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMIN + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMIN BLASFUNC(dzamin) +#else +#define AMIN BLASFUNC(scamin) +#endif +#else +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + + AMIN(&m, x, &inc_x); + + end(); + + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index 35249bdf9..7bf047abd 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -1,134 +1,134 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef HBMV - -#ifdef DOUBLE -#define HBMV BLASFUNC(zhbmv) -#else -#define HBMV BLASFUNC(chbmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {0.0, 0.0}; - blasint k = 1; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - if ((p = getenv("OPENBLAS_K"))) k = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, uplo, k, inc_x, inc_y, loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - - end(); - - timeg += getsec(); - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef HBMV + +#ifdef DOUBLE +#define HBMV BLASFUNC(zhbmv) +#else +#define HBMV BLASFUNC(chbmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {0.0, 0.0}; + blasint k = 1; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_K"))) k = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, uplo, k, inc_x, inc_y, loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); + + end(); + + timeg += getsec(); + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index 907e2adc4..0dc296ccc 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef HPMV - -#ifdef DOUBLE -#define HPMV BLASFUNC(zhpmv) -#else -#define HPMV BLASFUNC(chpmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HPMV (&uplo, &m, alpha, a, x, &inc_x, 
beta, y, &inc_y ); - - end(); - - time1 = getsec(); - - timeg += time1; - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef HPMV + +#ifdef DOUBLE +#define HPMV BLASFUNC(zhpmv) +#else +#define HPMV BLASFUNC(chpmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); + + end(); + + time1 = getsec(); + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/iamin.c b/benchmark/iamin.c index a57638ecc..2384641a5 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -1,120 +1,120 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef IAMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = 
MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 
0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) - inc_y = atoi(p); - - fprintf( - stderr, - "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, inc_x, inc_y, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - - if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (l = 0; l < loops; l++) { - begin(); - - ROTM(&m, x, &inc_x, y, &inc_y, param); - - end(); - - time1 = getsec(); - - timeg += time1; - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops %10.6f sec\n", - COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef ROTM + +#ifdef DOUBLE +#define ROTM BLASFUNC(drotm) +#else +#define ROTM BLASFUNC(srotm) +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x = 1, inc_y = 1; + FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) + inc_y = atoi(p); + + fprintf( + stderr, + "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, inc_x, inc_y, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + + if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + begin(); + + ROTM(&m, x, &inc_x, y, &inc_y, param); + + end(); + + time1 = getsec(); + + timeg += time1; + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} diff --git a/benchmark/spmv.c b/benchmark/spmv.c index e4dcbf4ae..1e62952ef 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -1,146 +1,146 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef SPMV - -#ifndef COMPLEX - -#ifdef DOUBLE -#define SPMV BLASFUNC(dspmv) -#else -#define SPMV BLASFUNC(sspmv) -#endif - -#else - -#ifdef DOUBLE -#define SPMV BLASFUNC(zspmv) -#else -#define SPMV BLASFUNC(cspmv) -#endif - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m,(int)m); - - for(j = 0; j < m; j++){ - for(i = 0; i < m * COMPSIZE; i++){ - a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' 
Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Thu, 17 Nov 2022 18:06:17 +0100 Subject: [PATCH 103/154] Fix errors in LAPACKE ?tpmqrt for row major matrices (Reference-LAPACK PR540) --- .../LAPACKE/src/lapacke_ctpmqrt_work.c | 32 ++++++++++++------- .../LAPACKE/src/lapacke_dtpmqrt_work.c | 32 ++++++++++++------- .../LAPACKE/src/lapacke_stpmqrt_work.c | 32 ++++++++++++------- .../LAPACKE/src/lapacke_ztpmqrt_work.c | 32 ++++++++++++------- 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c index 5ec948e7b..e01664bdf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_float* v_t = NULL; lapack_complex_float* t_t = NULL; lapack_complex_float* a_t = NULL; lapack_complex_float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices 
*/ - LAPACKE_cge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ctpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_cge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c index d9ee6226b..366acd369 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); double* v_t = NULL; double* t_t = NULL; double* a_t = NULL; double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -83,12 +91,12 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,nb) ); + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, 
ldb_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_dtpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c index 095fbdcd9..c5a3a1496 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); float* v_t = NULL; float* t_t = NULL; float* a_t = NULL; float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ -83,12 +91,12 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,nb) ); + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_sge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, m, 
n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_stpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_sge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c index 643ae1d9d..104efa8f3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_double* v_t = NULL; lapack_complex_double* t_t = NULL; lapack_complex_double* a_t = NULL; lapack_complex_double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ztpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, 
work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); From 6c9dbe5afa8ba39a93734ad27188cba4048f81a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 18:09:49 +0100 Subject: [PATCH 104/154] Add a LAPACKE interface for ?LANGB (Reference-LAPACK PR725) --- lapack-netlib/LAPACKE/src/lapacke_clangb.c | 73 ++++++++++++++++ .../LAPACKE/src/lapacke_clangb_work.c | 84 +++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_dlangb.c | 73 ++++++++++++++++ .../LAPACKE/src/lapacke_dlangb_work.c | 83 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_slangb.c | 73 ++++++++++++++++ .../LAPACKE/src/lapacke_slangb_work.c | 83 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_zlangb.c | 73 ++++++++++++++++ 7 files changed, 542 insertions(+) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_clangb.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_clangb_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dlangb.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_slangb.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_slangb_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zlangb.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb.c b/lapack-netlib/LAPACKE/src/lapacke_clangb.c new file mode 100644 index 000000000..0d61575aa --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_clangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_clangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c new file mode 100644 index 000000000..b5b2cf816 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_clangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_clangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c new file mode 100644 index 000000000..ca16ea7f4 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_dlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c new file mode 100644 index 000000000..ba04c2b62 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab, double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_dlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_dlangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb.c b/lapack-netlib/LAPACKE/src/lapacke_slangb.c new file mode 100644 index 000000000..9ba3f30d8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_slangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_slangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c new file mode 100644 index 000000000..7ef86e9d9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_slangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_slangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c new file mode 100644 index 000000000..3a22ad982 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_zlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", info ); + } + return res; +} From e4a31c0d23edbbc518c940b88623e6067fe9d0a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 18:15:04 +0100 Subject: [PATCH 105/154] add ?LANGB interface (Reference-LAPACK PR725) --- cmake/lapacke.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index c740eceb4..3a9352197 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -318,6 +318,8 @@ set(CSRC lapacke_clacn2.c lapacke_clag2z.c lapacke_clag2z_work.c + lapacke_clangb.c + lapacke_clangb_work.c lapacke_clange.c lapacke_clange_work.c lapacke_clanhe.c @@ -803,6 +805,8 @@ set(DSRC lapacke_dlag2s_work.c lapacke_dlamch.c lapacke_dlamch_work.c + lapacke_dlangb.c + lapacke_dlangb_work.c lapacke_dlange.c lapacke_dlange_work.c lapacke_dlansy.c @@ -1381,6 +1385,8 @@ set(SSRC lapacke_slag2d_work.c lapacke_slamch.c lapacke_slamch_work.c + lapacke_slangb.c + lapacke_slangb_work.c lapacke_slange.c lapacke_slange_work.c lapacke_slansy.c @@ -2089,6 +2095,8 @@ set(ZSRC lapacke_zlacrm_work.c lapacke_zlag2c.c lapacke_zlag2c_work.c + lapacke_zlangb.c + lapacke_zlangb_work.c lapacke_zlange.c lapacke_zlange_work.c lapacke_zlanhe.c From 48c9c6efb92ee9abb2c835cca1167bb2673d2dfd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 18:19:14 +0100 Subject: [PATCH 106/154] Add ?LANGB interface (Reference-LAPACK PR725) --- lapack-netlib/LAPACKE/src/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 7f827e1c9..9c02c1445 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -358,6 +358,8 @@ lapacke_clacrm.o \ lapacke_clacrm_work.o \ lapacke_clag2z.o \ lapacke_clag2z_work.o \ +lapacke_clangb.o \ +lapacke_clangb_work.o \ lapacke_clange.o \ lapacke_clange_work.o \ lapacke_clanhe.o \ @@ -842,6 +844,8 @@ lapacke_dlag2s.o \ lapacke_dlag2s_work.o \ lapacke_dlamch.o \ lapacke_dlamch_work.o \ +lapacke_dlangb.o \ +lapacke_dlangb_work.o \ lapacke_dlange.o \ lapacke_dlange_work.o \ lapacke_dlansy.o \ @@ -1414,6 +1418,8 @@ lapacke_slacpy.o \ 
lapacke_slacpy_work.o \ lapacke_slamch.o \ lapacke_slamch_work.o \ +lapacke_slangb.o \ +lapacke_slangb_work.o \ lapacke_slange.o \ lapacke_slange_work.o \ lapacke_slansy.o \ @@ -2116,6 +2122,8 @@ lapacke_zlacrm.o \ lapacke_zlacrm_work.o \ lapacke_zlag2c.o \ lapacke_zlag2c_work.o \ +lapacke_zlangb.o \ +lapacke_zlangb_work.o \ lapacke_zlange.o \ lapacke_zlange_work.o \ lapacke_zlanhe.o \ From 9fe75af5280a0cb658f89c0fad3d6edb5f7b421a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 19:46:53 +0100 Subject: [PATCH 107/154] Add a LAPACKE interface for ?LANGB (Reference-LAPACK PR725) --- .../LAPACKE/src/lapacke_zlangb_work.c | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c new file mode 100644 index 000000000..d64fb482d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_zlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_zlangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + return res; +} From 35295912a3f1b83ba8fd22f1fe2fccce6ff4a201 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 14:57:54 +0100 Subject: [PATCH 108/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/cgebak.f | 4 ++-- lapack-netlib/SRC/cgees.f | 2 +- lapack-netlib/SRC/cgeesx.f | 2 +- lapack-netlib/SRC/cgejsv.f | 36 +++++++++++++++++----------------- lapack-netlib/SRC/cggbak.f | 8 ++++---- lapack-netlib/SRC/cggbal.f | 4 ++-- lapack-netlib/SRC/cggglm.f | 2 +- lapack-netlib/SRC/cgghd3.f | 2 +- lapack-netlib/SRC/cgglse.f | 2 +- lapack-netlib/SRC/cggqrf.f | 2 +- lapack-netlib/SRC/chegvd.f | 6 +++--- lapack-netlib/SRC/chesv_rk.f | 2 +- lapack-netlib/SRC/chpgvd.f | 6 +++--- lapack-netlib/SRC/csysv.f | 2 +- lapack-netlib/SRC/csysv_rk.f | 2 +- lapack-netlib/SRC/csysv_rook.f | 2 +- lapack-netlib/SRC/cungbr.f | 2 +- 17 files changed, 43 insertions(+), 43 deletions(-) diff --git a/lapack-netlib/SRC/cgebak.f b/lapack-netlib/SRC/cgebak.f index 201dbfcec..4348d5ea4 100644 --- a/lapack-netlib/SRC/cgebak.f +++ b/lapack-netlib/SRC/cgebak.f @@ -238,7 +238,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f index 359fa2afe..71acfdba3 100644 --- a/lapack-netlib/SRC/cgees.f +++ b/lapack-netlib/SRC/cgees.f @@ -282,7 +282,7 @@ * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git 
a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f index 1113563ba..782e36747 100644 --- a/lapack-netlib/SRC/cgeesx.f +++ b/lapack-netlib/SRC/cgeesx.f @@ -337,7 +337,7 @@ * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 25ab81302..e37b25b6b 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -704,11 +704,11 @@ IF ( LQUERY ) THEN CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3 = REAL( CDUMMY(1) ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGEQRF = REAL( CDUMMY(1) ) + LWRK_CGEQRF = INT( CDUMMY(1) ) CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGELQF = REAL( CDUMMY(1) ) + LWRK_CGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -724,7 +724,7 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, $ N+LWRK_CGEQRF, LWRK_CGESVJ ) @@ -760,10 +760,10 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, $ N+LWRK_CGELQF, 2*N+LWRK_CGEQRF, @@ -799,10 +799,10 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF, $ LWRK_CGESVJ, LWRK_CUNMQRM ) @@ -861,26 +861,26 @@ IF ( LQUERY ) THEN CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) IF ( .NOT. 
JRACC ) THEN CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3N = REAL( CDUMMY(1) ) + LWRK_CGEQP3N = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJU = REAL( CDUMMY(1) ) + LWRK_CGESVJU = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, @@ -909,13 +909,13 @@ ELSE CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, $ 2*N+LWRK_CGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/cggbak.f b/lapack-netlib/SRC/cggbak.f index e8ac34805..159449601 100644 --- a/lapack-netlib/SRC/cggbak.f +++ b/lapack-netlib/SRC/cggbak.f @@ -253,7 +253,7 @@ IF( ILO.EQ.1 ) $ GO TO 50 DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -263,7 +263,7 @@ IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -277,7 +277,7 @@ IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -287,7 +287,7 @@ IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cggbal.f b/lapack-netlib/SRC/cggbal.f index c7a232415..66ba7a881 100644 --- a/lapack-netlib/SRC/cggbal.f +++ b/lapack-netlib/SRC/cggbal.f @@ -535,7 +535,7 @@ IRAB = ICAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ICAMAX( IHI, A( 1, I ), 1 ) @@ -543,7 +543,7 @@ ICAB = ICAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 3efca1e71..fb384b651 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -289,7 +289,7 @@ * CALL CGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = REAL( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * 
Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 76d7de4ce..1074b4828 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -511,7 +511,7 @@ * IF( JJ.GT.0 ) THEN DO I = JJ, 1, -1 - C = DBLE( A( J+1+I, J ) ) + C = REAL( A( J+1+I, J ) ) CALL CROT( IHI-TOP, A( TOP+1, J+I+1 ), 1, $ A( TOP+1, J+I ), 1, C, $ -CONJG( B( J+1+I, J ) ) ) diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f index 4785941db..cca20dfed 100644 --- a/lapack-netlib/SRC/cgglse.f +++ b/lapack-netlib/SRC/cgglse.f @@ -276,7 +276,7 @@ * CALL CGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = REAL( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index febd9be8d..0185f4e0d 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL CGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. * diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 0c708190c..4b7f43d52 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -360,9 +360,9 @@ CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL CHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LROPT = MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LROPT = INT( MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. 
INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f index a659c8e79..e123fa299 100644 --- a/lapack-netlib/SRC/chesv_rk.f +++ b/lapack-netlib/SRC/chesv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 754be31ed..65d08b783 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -335,9 +335,9 @@ CALL CHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LRWMIN = MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f index 6f175e381..4ddabf62f 100644 --- a/lapack-netlib/SRC/csysv.f +++ b/lapack-netlib/SRC/csysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f index 793e39df5..ef5334dcd 100644 --- a/lapack-netlib/SRC/csysv_rk.f +++ b/lapack-netlib/SRC/csysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f index daa9f27c4..aad594e21 100644 --- a/lapack-netlib/SRC/csysv_rook.f +++ b/lapack-netlib/SRC/csysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index c973d0b0a..a31a53d79 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -233,7 +233,7 @@ END IF END IF END IF - LWKOPT = REAL( WORK( 1 ) ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * From 08bc43c73d43ab0f20595b705c1b07a2ddabf41e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 15:04:30 +0100 Subject: [PATCH 109/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/dgebak.f | 4 ++-- lapack-netlib/SRC/dgees.f | 2 +- lapack-netlib/SRC/dgeesx.f | 2 +- lapack-netlib/SRC/dgelss.f | 26 +++++++++++++------------- lapack-netlib/SRC/dggglm.f | 2 +- lapack-netlib/SRC/dgglse.f | 2 +- lapack-netlib/SRC/dggqrf.f | 2 +- lapack-netlib/SRC/dggrqf.f | 2 +- lapack-netlib/SRC/dlag2s.f | 9 ++++++--- lapack-netlib/SRC/dlat2s.f | 7 +++++-- lapack-netlib/SRC/dorgbr.f | 2 +- lapack-netlib/SRC/dspgvd.f | 4 ++-- lapack-netlib/SRC/dsygvd.f | 4 ++-- lapack-netlib/SRC/dsysv.f | 2 +- lapack-netlib/SRC/dsysv_rk.f | 2 +- lapack-netlib/SRC/dsysv_rook.f | 2 +- 16 files changed, 40 insertions(+), 34 deletions(-) diff --git a/lapack-netlib/SRC/dgebak.f b/lapack-netlib/SRC/dgebak.f index e978d7af2..9c086794a 100644 --- a/lapack-netlib/SRC/dgebak.f +++ b/lapack-netlib/SRC/dgebak.f @@ -236,7 +236,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( 
SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/dgees.f b/lapack-netlib/SRC/dgees.f index 82b9d6ee4..24739b1cf 100644 --- a/lapack-netlib/SRC/dgees.f +++ b/lapack-netlib/SRC/dgees.f @@ -302,7 +302,7 @@ * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgeesx.f b/lapack-netlib/SRC/dgeesx.f index 08fbb6468..f3677fcb3 100644 --- a/lapack-netlib/SRC/dgeesx.f +++ b/lapack-netlib/SRC/dgeesx.f @@ -382,7 +382,7 @@ * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f index 8ed703fcf..c4190f2e0 100644 --- a/lapack-netlib/SRC/dgelss.f +++ b/lapack-netlib/SRC/dgelss.f @@ -254,11 +254,11 @@ * * Compute space needed for DGEQRF CALL DGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_DGEQRF=DUM(1) + LWORK_DGEQRF = INT( DUM(1) ) * Compute space needed for DORMQR CALL DORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_DORMQR=DUM(1) + LWORK_DORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_DGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_DORMQR ) @@ -273,15 +273,15 @@ * Compute space needed for DGEBRD CALL DGEBRD( MM, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_DGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_DORMBR ) @@ -305,23 +305,23 @@ * Compute space needed for DGELQF CALL DGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_DGELQF=DUM(1) + LWORK_DGELQF = INT( DUM(1) ) * Compute space needed for DGEBRD CALL DGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute space needed for DORMLQ CALL DORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMLQ=DUM(1) + LWORK_DORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_DGELQF MAXWRK = MAX( MAXWRK, M*M + 4*M + LWORK_DGEBRD ) @@ -341,15 +341,15 @@ * Compute space needed for DGEBRD CALL DGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL 
DORGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_DGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORGBR ) diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index d43785d32..ae0f0e908 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -288,7 +288,7 @@ * CALL DGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/dgglse.f b/lapack-netlib/SRC/dgglse.f index 2fd17bbcb..28aeaf6e7 100644 --- a/lapack-netlib/SRC/dgglse.f +++ b/lapack-netlib/SRC/dgglse.f @@ -276,7 +276,7 @@ * CALL DGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 617af274f..39d27a5c9 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL DGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. * diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index 07f8752d8..ddf4104c5 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL DGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/dlag2s.f b/lapack-netlib/SRC/dlag2s.f index e5a930223..9e6dead49 100644 --- a/lapack-netlib/SRC/dlag2s.f +++ b/lapack-netlib/SRC/dlag2s.f @@ -34,8 +34,8 @@ *> *> \verbatim *> -*> DLAG2S converts a DOUBLE PRECISION matrix, SA, to a SINGLE -*> PRECISION matrix, A. +*> DLAG2S converts a DOUBLE PRECISION matrix, A, to a SINGLE +*> PRECISION matrix, SA. *> *> RMAX is the overflow for the SINGLE PRECISION arithmetic *> DLAG2S checks that all the entries of A are between -RMAX and @@ -128,6 +128,9 @@ REAL SLAMCH EXTERNAL SLAMCH * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. * RMAX = SLAMCH( 'O' ) @@ -137,7 +140,7 @@ INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/dlat2s.f b/lapack-netlib/SRC/dlat2s.f index 3d00fe0a3..c926e9930 100644 --- a/lapack-netlib/SRC/dlat2s.f +++ b/lapack-netlib/SRC/dlat2s.f @@ -134,6 +134,9 @@ LOGICAL LSAME EXTERNAL SLAMCH, LSAME * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. 
* RMAX = SLAMCH( 'O' ) @@ -146,7 +149,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -157,7 +160,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f index 1b242ff97..7dfd03961 100644 --- a/lapack-netlib/SRC/dorgbr.f +++ b/lapack-netlib/SRC/dorgbr.f @@ -232,7 +232,7 @@ END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index 556326388..df215ae1a 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -307,8 +307,8 @@ CALL DSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL DSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index 61134bedc..3b38665a7 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -330,8 +330,8 @@ CALL DSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL DSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/dsysv.f b/lapack-netlib/SRC/dsysv.f index a6305e13c..ed6629ad9 100644 --- a/lapack-netlib/SRC/dsysv.f +++ b/lapack-netlib/SRC/dsysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL DSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_rk.f b/lapack-netlib/SRC/dsysv_rk.f index 05d8f7d3f..db8fd36dd 100644 --- a/lapack-netlib/SRC/dsysv_rk.f +++ b/lapack-netlib/SRC/dsysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL DSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_rook.f b/lapack-netlib/SRC/dsysv_rook.f index 6ebb52eae..85f293309 100644 --- a/lapack-netlib/SRC/dsysv_rook.f +++ b/lapack-netlib/SRC/dsysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL DSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF From e9b0f5a3648572db51b810afd8e0cb42993175e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 15:11:05 +0100 Subject: [PATCH 110/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/sgebak.f | 4 ++-- lapack-netlib/SRC/sgees.f | 2 +- lapack-netlib/SRC/sgeesx.f | 2 +- lapack-netlib/SRC/sggbak.f | 8 ++++---- lapack-netlib/SRC/sggbal.f | 4 ++-- lapack-netlib/SRC/sggglm.f | 2 +- lapack-netlib/SRC/sgglse.f | 2 +- lapack-netlib/SRC/sggqrf.f | 2 +- lapack-netlib/SRC/sggrqf.f | 2 +- lapack-netlib/SRC/sorgbr.f | 2 +- lapack-netlib/SRC/sspgvd.f | 4 ++-- lapack-netlib/SRC/ssygvd.f | 4 ++-- lapack-netlib/SRC/ssysv.f | 2 +- lapack-netlib/SRC/ssysv_rk.f | 2 +- lapack-netlib/SRC/ssysv_rook.f | 2 +- 15 files changed, 22 insertions(+), 22 deletions(-) diff 
--git a/lapack-netlib/SRC/sgebak.f b/lapack-netlib/SRC/sgebak.f index b51b611a9..abb7809a3 100644 --- a/lapack-netlib/SRC/sgebak.f +++ b/lapack-netlib/SRC/sgebak.f @@ -236,7 +236,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f index d40503f89..6febd549c 100644 --- a/lapack-netlib/SRC/sgees.f +++ b/lapack-netlib/SRC/sgees.f @@ -302,7 +302,7 @@ * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f index 27c4338d4..6810fe7c8 100644 --- a/lapack-netlib/SRC/sgeesx.f +++ b/lapack-netlib/SRC/sgeesx.f @@ -382,7 +382,7 @@ * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sggbak.f b/lapack-netlib/SRC/sggbak.f index bb7f36011..8a796fdb1 100644 --- a/lapack-netlib/SRC/sggbak.f +++ b/lapack-netlib/SRC/sggbak.f @@ -252,7 +252,7 @@ $ GO TO 50 * DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -262,7 +262,7 @@ IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -276,7 +276,7 @@ IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -286,7 +286,7 @@ IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sggbal.f b/lapack-netlib/SRC/sggbal.f index 6cfdbcdba..d7a8ef16c 100644 --- a/lapack-netlib/SRC/sggbal.f +++ b/lapack-netlib/SRC/sggbal.f @@ -522,7 +522,7 @@ IRAB = ISAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ISAMAX( IHI, A( 1, I ), 1 ) @@ -530,7 +530,7 @@ ICAB = ISAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index bbd032beb..56b4dba52 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -288,7 +288,7 @@ * CALL SGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f index 7ef8782b0..59addc3f4 100644 --- 
a/lapack-netlib/SRC/sgglse.f +++ b/lapack-netlib/SRC/sgglse.f @@ -276,7 +276,7 @@ * CALL SGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index c57b16a56..59b498da5 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL SGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. * diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index c4a78c347..8b7d4786a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL SGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index 8f15523d4..b1a5c03a2 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -232,7 +232,7 @@ END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 9db8de08c..73862ed1b 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -307,8 +307,8 @@ CALL SSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 9002df237..7c7e0de01 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -330,8 +330,8 @@ CALL SSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL SSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. 
INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f index 5f4062e9a..06a42dfb7 100644 --- a/lapack-netlib/SRC/ssysv.f +++ b/lapack-netlib/SRC/ssysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f index 9e0487623..9a7dfa4bb 100644 --- a/lapack-netlib/SRC/ssysv_rk.f +++ b/lapack-netlib/SRC/ssysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f index b4da1101c..fb7ba8c53 100644 --- a/lapack-netlib/SRC/ssysv_rook.f +++ b/lapack-netlib/SRC/ssysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF From 4e60737c2d914de2385c66dfb097b8d3d4d73d10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 15:22:46 +0100 Subject: [PATCH 111/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/zgebak.f | 4 ++-- lapack-netlib/SRC/zgees.f | 2 +- lapack-netlib/SRC/zgeesx.f | 2 +- lapack-netlib/SRC/zgejsv.f | 36 +++++++++++++++++----------------- lapack-netlib/SRC/zggglm.f | 2 +- lapack-netlib/SRC/zgglse.f | 2 +- lapack-netlib/SRC/zggqrf.f | 2 +- lapack-netlib/SRC/zggrqf.f | 2 +- lapack-netlib/SRC/zhegvd.f | 6 +++--- lapack-netlib/SRC/zhesv_rk.f | 2 +- lapack-netlib/SRC/zhpgvd.f | 6 +++--- lapack-netlib/SRC/zlag2c.f | 4 ++-- lapack-netlib/SRC/zlaic1.f | 4 ++-- lapack-netlib/SRC/zlat2c.f | 6 +++--- lapack-netlib/SRC/zsysv.f | 2 +- lapack-netlib/SRC/zsysv_rk.f | 2 +- lapack-netlib/SRC/zsysv_rook.f | 2 +- lapack-netlib/SRC/zungbr.f | 2 +- 18 files changed, 44 insertions(+), 44 deletions(-) diff --git a/lapack-netlib/SRC/zgebak.f b/lapack-netlib/SRC/zgebak.f index 9ec610efb..9a0f65a43 100644 --- a/lapack-netlib/SRC/zgebak.f +++ b/lapack-netlib/SRC/zgebak.f @@ -238,7 +238,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/zgees.f b/lapack-netlib/SRC/zgees.f index 40fe78d34..d673087bf 100644 --- a/lapack-netlib/SRC/zgees.f +++ b/lapack-netlib/SRC/zgees.f @@ -282,7 +282,7 @@ * CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgeesx.f b/lapack-netlib/SRC/zgeesx.f index ca4f5c913..bdd741b11 100644 --- a/lapack-netlib/SRC/zgeesx.f +++ b/lapack-netlib/SRC/zgeesx.f @@ -337,7 +337,7 @@ * CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 0c2226f9f..d1106696c 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -707,11 +707,11 @@ IF ( LQUERY ) THEN CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR 
) - LWRK_ZGEQP3 = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3 = INT( CDUMMY(1) ) CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGEQRF = DBLE( CDUMMY(1) ) + LWRK_ZGEQRF = INT( CDUMMY(1) ) CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGELQF = DBLE( CDUMMY(1) ) + LWRK_ZGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -727,7 +727,7 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, $ N+LWRK_ZGEQRF, LWRK_ZGESVJ ) @@ -763,10 +763,10 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, $ N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF, @@ -802,10 +802,10 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF, $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) @@ -864,26 +864,26 @@ IF ( LQUERY ) THEN CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) IF ( .NOT. 
JRACC ) THEN CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_ZGEQP3N = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3N = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJU = DBLE( CDUMMY(1) ) + LWRK_ZGESVJU = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, @@ -912,13 +912,13 @@ ELSE CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+LWRK_ZGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index 6c24131aa..62b4acdec 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -289,7 +289,7 @@ * CALL ZGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = DBLE( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/zgglse.f b/lapack-netlib/SRC/zgglse.f index e5869a7d4..cc558bc40 100644 --- a/lapack-netlib/SRC/zgglse.f +++ b/lapack-netlib/SRC/zgglse.f @@ -276,7 +276,7 @@ * CALL ZGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = DBLE( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 93b1dc0fc..0388b0874 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL ZGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. 
* diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index a2d4a9d55..be912c772 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL ZGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index 2e92255df..eeda656ad 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -360,9 +360,9 @@ CALL ZHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL ZHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LROPT = MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LROPT = INT( MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/zhesv_rk.f b/lapack-netlib/SRC/zhesv_rk.f index 1ec75cc04..6333e9f36 100644 --- a/lapack-netlib/SRC/zhesv_rk.f +++ b/lapack-netlib/SRC/zhesv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL ZHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index d27cdc761..e96e39738 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -335,9 +335,9 @@ CALL ZHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL ZHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LRWMIN = MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/zlag2c.f b/lapack-netlib/SRC/zlag2c.f index ba141a98f..434590bb9 100644 --- a/lapack-netlib/SRC/zlag2c.f +++ b/lapack-netlib/SRC/zlag2c.f @@ -124,7 +124,7 @@ DOUBLE PRECISION RMAX * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. REAL SLAMCH @@ -142,7 +142,7 @@ INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/zlaic1.f b/lapack-netlib/SRC/zlaic1.f index 72948cde9..47927e778 100644 --- a/lapack-netlib/SRC/zlaic1.f +++ b/lapack-netlib/SRC/zlaic1.f @@ -348,9 +348,9 @@ B = ( ZETA2*ZETA2+ZETA1*ZETA1-ONE )*HALF C = ZETA1*ZETA1 IF( B.GE.ZERO ) THEN - T = -C / ( B+SQRT( B*B+C ) ) + T = DBLE( -C / ( B+SQRT( B*B+C ) ) ) ELSE - T = B - SQRT( B*B+C ) + T = DBLE( B - SQRT( B*B+C ) ) END IF SINE = -( ALPHA / ABSEST ) / T COSINE = -( GAMMA / ABSEST ) / ( ONE+T ) diff --git a/lapack-netlib/SRC/zlat2c.f b/lapack-netlib/SRC/zlat2c.f index 1d607dcea..a413b05c1 100644 --- a/lapack-netlib/SRC/zlat2c.f +++ b/lapack-netlib/SRC/zlat2c.f @@ -130,7 +130,7 @@ LOGICAL UPPER * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. 
REAL SLAMCH @@ -151,7 +151,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -164,7 +164,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/zsysv.f b/lapack-netlib/SRC/zsysv.f index ed173dadc..44f1e25b1 100644 --- a/lapack-netlib/SRC/zsysv.f +++ b/lapack-netlib/SRC/zsysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_rk.f b/lapack-netlib/SRC/zsysv_rk.f index df828ee33..8d9fb82c8 100644 --- a/lapack-netlib/SRC/zsysv_rk.f +++ b/lapack-netlib/SRC/zsysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_rook.f b/lapack-netlib/SRC/zsysv_rook.f index 7c9fb4bf6..745339512 100644 --- a/lapack-netlib/SRC/zsysv_rook.f +++ b/lapack-netlib/SRC/zsysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3dfca43be..c42a372c5 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -233,7 +233,7 @@ END IF END IF END IF - LWKOPT = DBLE( WORK( 1 ) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) LWKOPT = MAX (LWKOPT, MN) END IF * From 15967809adb4275a1c5b11cd9a3fc10be3b13c3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 19:15:09 +0100 Subject: [PATCH 112/154] Define type conversions explicitly (Reference-LAPACK PR703) --- lapack-netlib/SRC/cggrqf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index b43febc1f..5227100da 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL CGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * From 63014e99ae33751da236a7f2bf90af8113af89ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:31:33 +0100 Subject: [PATCH 113/154] Cast work array sizes to integer (Reference-LAPACK PR 684) --- lapack-netlib/SRC/cgelss.f | 26 +++++++++++++------------- lapack-netlib/SRC/sgelss.f | 24 ++++++++++++------------ lapack-netlib/SRC/zgelss.f | 26 +++++++++++++------------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index 04defbb2e..da6b9092f 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -266,11 +266,11 @@ * * Compute space needed for CGEQRF CALL CGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_CGEQRF = REAL( DUM(1) ) + LWORK_CGEQRF = INT( DUM(1) ) * Compute space needed for CUNMQR CALL CUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_CUNMQR = REAL( DUM(1) ) + LWORK_CUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'CGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ * Compute space needed for CGEBRD CALL CGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) 
+ LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_CGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_CUNMBR ) @@ -310,23 +310,23 @@ * Compute space needed for CGELQF CALL CGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGELQF = REAL( DUM(1) ) + LWORK_CGELQF = INT( DUM(1) ) * Compute space needed for CGEBRD CALL CGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute space needed for CUNMLQ CALL CUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMLQ = REAL( DUM(1) ) + LWORK_CUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_CGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_CGEBRD ) @@ -345,15 +345,15 @@ * Compute space needed for CGEBRD CALL CGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) MAXWRK = 2*M + LWORK_CGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNGBR ) diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index be9e2ea11..9aed4329f 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -253,11 +253,11 @@ * * Compute space needed for SGEQRF CALL SGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_SGEQRF=DUM(1) + LWORK_SGEQRF = INT( DUM(1) ) * Compute space needed for SORMQR CALL SORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_SORMQR=DUM(1) + LWORK_SORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_SGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_SORMQR ) @@ -272,15 +272,15 @@ * Compute space needed for SGEBRD CALL SGEBRD( MM, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_SGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_SORMBR ) @@ -304,19 +304,19 @@ * Compute space needed for SGEBRD CALL SGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL 
SORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * Compute space needed for SORMLQ CALL SORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMLQ=DUM(1) + LWORK_SORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + M*ILAENV( 1, 'SGELQF', ' ', M, N, -1, $ -1 ) @@ -337,15 +337,15 @@ * Compute space needed for SGEBRD CALL SGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_SGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORGBR ) diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f index e4aba6497..be53ba95b 100644 --- a/lapack-netlib/SRC/zgelss.f +++ b/lapack-netlib/SRC/zgelss.f @@ -266,11 +266,11 @@ * * Compute space needed for ZGEQRF CALL ZGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_ZGEQRF = DBLE( DUM(1) ) + LWORK_ZGEQRF = INT( DUM(1) ) * Compute space needed for ZUNMQR CALL ZUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_ZUNMQR = DBLE( DUM(1) ) + LWORK_ZUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'ZGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ * Compute space needed for ZGEBRD CALL ZGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZUNMBR ) @@ -310,23 +310,23 @@ * Compute space needed for ZGELQF CALL ZGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGELQF = DBLE( DUM(1) ) + LWORK_ZGELQF = INT( DUM(1) ) * Compute space needed for ZGEBRD CALL ZGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute space needed for ZUNMLQ CALL ZUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMLQ = DBLE( DUM(1) ) + LWORK_ZUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_ZGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_ZGEBRD ) @@ -345,15 +345,15 @@ * Compute space needed for ZGEBRD CALL ZGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + 
LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) MAXWRK = 2*M + LWORK_ZGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNGBR ) From d0afbd8d29f3405f2a670bbc72c264d4d54d5b24 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:34:42 +0100 Subject: [PATCH 114/154] Add new routines for ?GELST similar to ?GELS (Reference-LAPACK PR739) --- lapack-netlib/SRC/cgelst.f | 533 +++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgelst.f | 531 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgelst.f | 531 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgelst.f | 533 +++++++++++++++++++++++++++++++++++++ 4 files changed, 2128 insertions(+) create mode 100644 lapack-netlib/SRC/cgelst.f create mode 100644 lapack-netlib/SRC/dgelst.f create mode 100644 lapack-netlib/SRC/sgelst.f create mode 100644 lapack-netlib/SRC/zgelst.f diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f new file mode 100644 index 000000000..7d8e44ddf --- /dev/null +++ b/lapack-netlib/SRC/cgelst.f @@ -0,0 +1,533 @@ +*> \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. 
+*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by CGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by CGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \ingroup complexGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD, + $ CLASCL, CLASET, CTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'CGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'CGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = CLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = CLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL CGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL CGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL CGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of CGELST +* + END diff --git a/lapack-netlib/SRC/dgelst.f b/lapack-netlib/SRC/dgelst.f new file mode 100644 index 000000000..ca0e04a9b --- /dev/null +++ b/lapack-netlib/SRC/dgelst.f @@ -0,0 +1,531 @@ +*> \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by DGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by DGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
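A minimal calling sketch for the interface documented above: query the optimal workspace with LWORK = -1, then call DGELST again to solve. Everything apart from the DGELST calls themselves (the 3-by-2 data, the fixed LWMAX buffer size, and the variable names) is an illustrative assumption, not something taken from this patch.

      PROGRAM DGELST_EXAMPLE
*     Illustrative sketch only: solve the 3-by-2 least squares problem
*     min || b - A*x || with DGELST; data and names are hypothetical.
      INTEGER            M, N, NRHS, LDA, LDB, LWMAX, LWORK, INFO
      PARAMETER          ( M = 3, N = 2, NRHS = 1, LDA = M, LDB = M,
     $                     LWMAX = 100 )
      DOUBLE PRECISION   A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
      DATA               A / 1.0D0, 1.0D0, 1.0D0,
     $                       1.0D0, 2.0D0, 3.0D0 /
      DATA               B / 6.0D0, 0.0D0, 0.0D0 /
*     Workspace query: the optimal LWORK is returned in WORK( 1 )
      CALL DGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*     Solve; on exit rows 1 to N of B hold the least squares solution
      CALL DGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
     $             INFO )
      IF( INFO.NE.0 )
     $   WRITE( *, * ) 'DGELST returned INFO = ', INFO
      END

On successful exit rows 1 to N of B contain the least squares solution, and the sum of squares of elements N+1 to M in each column gives the residual sum of squares, as described for the B argument above.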
+* +*> \ingroup doubleGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DGELQT, DGEQRT, DGEMLQT, DGEMQRT, DLABAD, + $ DLASCL, DLASET, DTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'DGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'DGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = DLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = DLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL DGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL DGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL DGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of DGELST +* + END diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f new file mode 100644 index 000000000..5377bc720 --- /dev/null +++ b/lapack-netlib/SRC/sgelst.f @@ -0,0 +1,531 @@ +*> \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by SGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by SGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
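As a concrete reading of the LWORK bounds documented above (the block size NB = 32 is only an assumed value; ILAENV supplies the real one): for M = 1000, N = 100 and NRHS = 1 we have MN = min(M,N) = 100, so the minimum workspace is max(1, MN + max(MN, NRHS)) = 200, while the optimal size becomes (MN + max(MN, NRHS))*NB = 200*32 = 6400.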
+* +*> \ingroup realGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD, + $ SLASCL, SLASET, STRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'SGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'SGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = SLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = SLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL SGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL SGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL SGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of SGELST +* + END diff --git a/lapack-netlib/SRC/zgelst.f b/lapack-netlib/SRC/zgelst.f new file mode 100644 index 000000000..4dabdc91e --- /dev/null +++ b/lapack-netlib/SRC/zgelst.f @@ -0,0 +1,533 @@ +*> \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by ZGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by ZGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
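The complex driver follows the same two-call pattern, with WORK declared COMPLEX*16 and TRANS = 'C' selecting the conjugate-transposed system A**H * X = B. A minimal sketch follows; the 3-by-2 data, the LWMAX buffer and the variable names are illustrative assumptions, and only the ZGELST calls themselves follow the interface documented above.

      PROGRAM ZGELST_EXAMPLE
*     Illustrative sketch only: minimum norm solution of A**H * X = B
*     with ZGELST ( TRANS = 'C', M >= N ); data are hypothetical.
      INTEGER            M, N, NRHS, LDA, LDB, LWMAX, LWORK, INFO
      PARAMETER          ( M = 3, N = 2, NRHS = 1, LDA = M, LDB = M,
     $                     LWMAX = 100 )
      COMPLEX*16         A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
      DATA               A / (1.0D0, 0.0D0), (0.0D0, 1.0D0),
     $                       (1.0D0, 0.0D0), (0.0D0, 0.0D0),
     $                       (2.0D0, 0.0D0), (1.0D0,-1.0D0) /
      DATA               B / (1.0D0, 0.0D0), (0.0D0, 2.0D0),
     $                       (0.0D0, 0.0D0) /
*     Workspace query: the optimal LWORK is returned in WORK( 1 )
      CALL ZGELST( 'C', M, N, NRHS, A, LDA, B, LDB, WORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*     Solve; on exit rows 1 to M of B hold the minimum norm solution
      CALL ZGELST( 'C', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
     $             INFO )
      IF( INFO.NE.0 )
     $   WRITE( *, * ) 'ZGELST returned INFO = ', INFO
      END

TRANS = 'C' is chosen here only to exercise the conjugate-transposed path of case 3 in the Purpose section; with TRANS = 'N' the complex driver behaves like its real counterparts.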
+* +*> \ingroup complex16GEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZGELQT, ZGEQRT, ZGEMLQT, ZGEMQRT, DLABAD, + $ ZLASCL, ZLASET, ZTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'ZGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'ZGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = ZLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = ZLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL ZGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL ZGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL ZGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of ZGELST +* + END From 1497336b203a7efa09cc788099a79d7732662fdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:39:16 +0100 Subject: [PATCH 115/154] Add tests for ?GELST (Reference-LAPACK PR739) --- lapack-netlib/TESTING/LIN/alahd.f | 22 +- lapack-netlib/TESTING/LIN/cdrvls.f | 320 ++++++++++++++++++++------- lapack-netlib/TESTING/LIN/cerrls.f | 61 +++++- lapack-netlib/TESTING/LIN/ddrvls.f | 339 +++++++++++++++++++++-------- lapack-netlib/TESTING/LIN/derrls.f | 61 +++++- lapack-netlib/TESTING/LIN/sdrvls.f | 333 ++++++++++++++++++++-------- lapack-netlib/TESTING/LIN/serrls.f | 61 +++++- lapack-netlib/TESTING/LIN/zdrvls.f | 333 ++++++++++++++++++++-------- lapack-netlib/TESTING/LIN/zerrls.f | 61 +++++- 9 files changed, 1234 insertions(+), 357 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index 2cc0fba06..f0423a23b 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -608,17 +608,18 @@ ELSE IF( LSAMEN( 2, P2, 'LS' ) ) THEN * * LS: Least Squares driver routines for -* LS, LSD, LSS, LSX and LSY. +* LS, LST, TSLS, LSD, LSS, LSX and LSY. 
* WRITE( IOUNIT, FMT = 9984 )PATH WRITE( IOUNIT, FMT = 9967 ) - WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1 + WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1, C1, C1 WRITE( IOUNIT, FMT = 9935 )1 WRITE( IOUNIT, FMT = 9931 )2 - WRITE( IOUNIT, FMT = 9933 )3 - WRITE( IOUNIT, FMT = 9935 )4 - WRITE( IOUNIT, FMT = 9934 )5 - WRITE( IOUNIT, FMT = 9932 )6 + WRITE( IOUNIT, FMT = 9919 ) + WRITE( IOUNIT, FMT = 9933 )7 + WRITE( IOUNIT, FMT = 9935 )8 + WRITE( IOUNIT, FMT = 9934 )9 + WRITE( IOUNIT, FMT = 9932 )10 WRITE( IOUNIT, FMT = 9920 ) WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) * @@ -1048,10 +1049,11 @@ $ 'check if X is in the row space of A or A'' ', $ '(overdetermined case)' ) 9929 FORMAT( ' Test ratios (1-3: ', A1, 'TZRZF):' ) - 9920 FORMAT( 3X, ' 7-10: same as 3-6', 3X, ' 11-14: same as 3-6' ) - 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-6: ', A1, - $ 'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: ', - $ A1, 'GETSLS)') + 9919 FORMAT( 3X, ' 3-4: same as 1-2', 3X, ' 5-6: same as 1-2' ) + 9920 FORMAT( 3X, ' 11-14: same as 7-10', 3X, ' 15-18: same as 7-10' ) + 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-4: ', A1, + $ 'GELST, 5-6: ', A1, 'GETSLS, 7-10: ', A1, 'GELSY, 11-14: ', + $ A1, 'GETSS, 15-18: ', A1, 'GELSD)' ) 9928 FORMAT( 7X, 'where ALPHA = ( 1 + SQRT( 17 ) ) / 8' ) 9927 FORMAT( 3X, I2, ': ABS( Largest element in L )', / 12X, $ ' - ( 1 / ( 1 - ALPHA ) ) + THRESH' ) diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index 7fe189e5f..ecba705d5 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -31,7 +31,8 @@ *> *> \verbatim *> -*> CDRVLS tests the least squares driver routines CGELS, CGETSLS, CGELSS, CGELSY +*> CDRVLS tests the least squares driver routines CGELS, CGELST, +*> CGETSLS, CGELSS, CGELSY *> and CGELSD. *> \endverbatim * @@ -211,7 +212,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, ZERO @@ -228,8 +229,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_CGELS, LWORK_CGETSLS, LWORK_CGELSS, - $ LWORK_CGELSY, LWORK_CGELSD, + $ LWORK_CGELS, LWORK_CGELST, LWORK_CGETSLS, + $ LWORK_CGELSS, LWORK_CGELSY, LWORK_CGELSD, $ LRWORK_CGELSY, LRWORK_CGELSS, LRWORK_CGELSD REAL EPS, NORMA, NORMB, RCOND * .. @@ -249,7 +250,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, CERRLS, CGELS, CGELSD, - $ CGELSS, CGELSY, CGEMM, CGETSLS, CLACPY, + $ CGELSS, CGELST, CGELSY, CGEMM, CGETSLS, CLACPY, $ CLARNV, CQRT13, CQRT15, CQRT16, CSSCAL, $ SAXPY, XLAENV * .. @@ -334,7 +335,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. 
* DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +363,10 @@ CALL CGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_CGELS = INT( WQ( 1 ) ) +* Compute workspace needed for CGELST + CALL CGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_CGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for CGETSLS CALL CGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -425,21 +431,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test CGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test CGELS -* * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +495,20 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for CGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for CGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +531,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +540,34 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test CGETSLS + END DO + END DO + END IF +* ===================================================== +* End test CGELS +* ===================================================== +* ===================================================== +* Begin test CGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. 
+* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +584,9 @@ IF( NCOLS.GT.0 ) THEN CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL CSCAL( NCOLS*NRHS, - $ CONE / REAL( NCOLS ), WORK, - $ 1 ) + CALL CSSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), WORK, + $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +602,37 @@ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'CGETSLS ' - CALL CGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'CGELST' + CALL CGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'CGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for CGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for CGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = CQRT17( TRANS, 1, M, N, + RESULT( 4 ) = CQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +640,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = CQRT14( TRANS, M, N, + RESULT( 4 ) = CQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +648,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test CGELST +* ===================================================== +* ===================================================== +* Begin test CGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL CLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL CSCAL( NCOLS*NRHS, + $ CONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL CGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. 
N.GT.0 ) THEN + CALL CLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'CGETSLS ' + CALL CGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for CGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL CQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for CGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = CQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = CQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test CGELSTSLS +* ==================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -680,37 +840,37 @@ * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = CQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = CQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +896,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +952,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace 
of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/cerrls.f b/lapack-netlib/TESTING/LIN/cerrls.f index 48e44ad86..fca943918 100644 --- a/lapack-netlib/TESTING/LIN/cerrls.f +++ b/lapack-netlib/TESTING/LIN/cerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> CERRLS tests the error exits for the COMPLEX least squares -*> driver routines (CGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (CGELS, CGELST, CGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CGELS, CGELSD, CGELSS, CGELSY, CHKXER + EXTERNAL ALAESM, CHKXER, CGELS, CGELSD, CGELSS, CGELST, + $ CGELSY, CGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL CGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) * +* CGELST +* + SRNAMT = 'CGELST' + INFOT = 1 + CALL CGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) +* +* CGETSLS +* + SRNAMT = 'CGETSLS' + INFOT = 1 + CALL CGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) +* * CGELSS * SRNAMT = 'CGELSS' diff --git a/lapack-netlib/TESTING/LIN/ddrvls.f b/lapack-netlib/TESTING/LIN/ddrvls.f index b64930c10..b3d07d67f 100644 --- a/lapack-netlib/TESTING/LIN/ddrvls.f +++ b/lapack-netlib/TESTING/LIN/ddrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim 
*> -*> DDRVLS tests the least squares driver routines DGELS, DGETSLS, DGELSS, DGELSY, -*> and DGELSD. +*> DDRVLS tests the least squares driver routines DGELS, DGELST, +*> DGETSLS, DGELSS, DGELSY, and DGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, TWO, ZERO @@ -225,8 +225,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_DGELS, LWORK_DGETSLS, LWORK_DGELSS, - $ LWORK_DGELSY, LWORK_DGELSD + $ LWORK_DGELS, LWORK_DGELST, LWORK_DGETSLS, + $ LWORK_DGELSS, LWORK_DGELSY, LWORK_DGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. @@ -243,12 +243,12 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DERRLS, DGELS, - $ DGELSD, DGELSS, DGELSY, DGEMM, DLACPY, - $ DLARNV, DLASRT, DQRT13, DQRT15, DQRT16, DSCAL, - $ XLAENV + $ DGELSD, DGELSS, DGELST, DGELSY, DGEMM, + $ DGETSLS, DLACPY, DLARNV, DQRT13, DQRT15, + $ DQRT16, DSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, INT, LOG, MAX, MIN, SQRT + INTRINSIC DBLE, INT, MAX, MIN, SQRT * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -330,7 +330,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ CALL DGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_DGELS = INT ( WQ ( 1 ) ) +* Compute workspace needed for DGELST + CALL DGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_DGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for DGETSLS CALL DGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -378,9 +383,9 @@ * Compute LIWORK workspace needed for DGELSY and DGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGETSLS, - $ LWORK_DGELSY, LWORK_DGELSS, - $ LWORK_DGELSD ) + LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGELST, + $ LWORK_DGETSLS, LWORK_DGELSY, + $ LWORK_DGELSS, LWORK_DGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* ===================================================== +* Begin test DGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test DGELS -* * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for DGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for DGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. 
M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,35 +517,42 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9999 )TRANS, M, + WRITE( NOUT, FMT = 9999 ) TRANS, M, $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test DGETSLS + END DO + END DO + END IF +* ===================================================== +* End test DGELS +* ===================================================== +* ===================================================== +* Begin test DGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'DGETSLS ' - CALL DGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'DGELST' + CALL DGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'DGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for DGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for DGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = DQRT17( TRANS, 1, M, N, + RESULT( 4 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = DQRT14( TRANS, M, N, + RESULT( 4 ) = DQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test DGELST +* ===================================================== +* ===================================================== +* Begin test DGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL DLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL DSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL DGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL DLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'DGETSLS' + CALL DGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for DGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL DQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for DGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = DQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = DQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test DGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -662,37 +823,37 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = DQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = DQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 
14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -826,6 +987,12 @@ NRUN = NRUN + 12 * 100 CONTINUE + + + + + + 110 CONTINUE 120 CONTINUE 130 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/derrls.f b/lapack-netlib/TESTING/LIN/derrls.f index a1f74dec2..09d745238 100644 --- a/lapack-netlib/TESTING/LIN/derrls.f +++ b/lapack-netlib/TESTING/LIN/derrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> DERRLS tests the error exits for the DOUBLE PRECISION least squares -*> driver routines (DGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (DGELS, DGELST, DGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELSY + EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELST, + $ DGELSY, DGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL DGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) * +* DGELST +* + SRNAMT = 'DGELST' + INFOT = 1 + CALL DGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) +* +* DGETSLS +* + SRNAMT = 'DGETSLS' + INFOT = 1 + CALL DGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) +* * DGELSS * SRNAMT = 'DGELSS' diff --git 
a/lapack-netlib/TESTING/LIN/sdrvls.f b/lapack-netlib/TESTING/LIN/sdrvls.f index b96451503..2baf9a3fb 100644 --- a/lapack-netlib/TESTING/LIN/sdrvls.f +++ b/lapack-netlib/TESTING/LIN/sdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> SDRVLS tests the least squares driver routines SGELS, SGETSLS, SGELSS, SGELSY, -*> and SGELSD. +*> SDRVLS tests the least squares driver routines SGELS, SGELST, +*> SGETSLS, SGELSS, SGELSY and SGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, TWO, ZERO @@ -225,8 +225,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_SGELS, LWORK_SGETSLS, LWORK_SGELSS, - $ LWORK_SGELSY, LWORK_SGELSD + $ LWORK_SGELS, LWORK_SGELST, LWORK_SGETSLS, + $ LWORK_SGELSS, LWORK_SGELSY, LWORK_SGELSD REAL EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. @@ -243,12 +243,12 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, SAXPY, SERRLS, SGELS, - $ SGELSD, SGELSS, SGELSY, SGEMM, SLACPY, - $ SLARNV, SQRT13, SQRT15, SQRT16, SSCAL, - $ XLAENV, SGETSLS + $ SGELSD, SGELSS, SGELST, SGELSY, SGEMM, + $ SGETSLS, SLACPY, SLARNV, SQRT13, SQRT15, + $ SQRT16, SSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC INT, LOG, MAX, MIN, REAL, SQRT + INTRINSIC INT, MAX, MIN, REAL, SQRT * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -330,7 +330,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ CALL SGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) LWORK_SGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for SGELST + CALL SGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_SGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for SGETSLS CALL SGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) @@ -378,9 +383,9 @@ * Compute LIWORK workspace needed for SGELSY and SGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGETSLS, - $ LWORK_SGELSY, LWORK_SGELSS, - $ LWORK_SGELSD ) + LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGELST, + $ LWORK_SGETSLS, LWORK_SGELSY, + $ LWORK_SGELSS, LWORK_SGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* ===================================================== +* Begin test SGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test SGELS -* * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for SGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. 
NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for SGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,7 +517,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -509,26 +526,33 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test SGETSLS + END DO + END DO + END IF +* ===================================================== +* End test SGELS +* ===================================================== +* ===================================================== +* Begin test SGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) -* - DO 60 ITRAN = 1, 2 +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'SGETSLS ' - CALL SGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'SGELST' + CALL SGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'SGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for SGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for SGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = SQRT17( TRANS, 1, M, N, + RESULT( 4 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = SQRT14( TRANS, M, N, + RESULT( 4 ) = SQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO END IF +* ===================================================== +* End test SGELST +* ===================================================== +* ===================================================== +* Begin test SGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL SLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL SSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL SGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL SLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'SGETSLS' + CALL SGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for SGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL SQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for SGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = SQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = SQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO + END IF +* ===================================================== +* End test SGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -662,37 +823,37 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = SQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = SQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 
14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/serrls.f b/lapack-netlib/TESTING/LIN/serrls.f index e6ee4360f..6c4820066 100644 --- a/lapack-netlib/TESTING/LIN/serrls.f +++ b/lapack-netlib/TESTING/LIN/serrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> SERRLS tests the error exits for the REAL least squares -*> driver routines (SGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (SGELS, SGELST, SGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELSY + EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELST, + $ SGELSY, SGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL SGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) * +* SGELST +* + SRNAMT = 'SGELST' + INFOT = 1 + CALL SGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) +* +* SGETSLS +* + SRNAMT = 'SGETSLS' + INFOT = 1 + CALL SGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) +* * SGELSS * SRNAMT = 'SGELSS' diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 2eab97905..b21345d30 100644 --- 
a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> ZDRVLS tests the least squares driver routines ZGELS, ZGETSLS, ZGELSS, ZGELSY -*> and ZGELSD. +*> ZDRVLS tests the least squares driver routines ZGELS, ZGELST, +*> ZGETSLS, ZGELSS, ZGELSY and ZGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, ZERO @@ -228,8 +228,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_ZGELS, LWORK_ZGETSLS, LWORK_ZGELSS, - $ LWORK_ZGELSY, LWORK_ZGELSD, + $ LWORK_ZGELS, LWORK_ZGELST, LWORK_ZGETSLS, + $ LWORK_ZGELSS, LWORK_ZGELSY, LWORK_ZGELSD, $ LRWORK_ZGELSY, LRWORK_ZGELSS, LRWORK_ZGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. @@ -248,10 +248,10 @@ EXTERNAL DASUM, DLAMCH, ZQRT12, ZQRT14, ZQRT17 * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DLASRT, XLAENV, - $ ZDSCAL, ZERRLS, ZGELS, ZGELSD, ZGELSS, - $ ZGELSY, ZGEMM, ZLACPY, ZLARNV, ZQRT13, ZQRT15, - $ ZQRT16, ZGETSLS + EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, ZERRLS, ZGELS, + $ ZGELSD, ZGELSS, ZGELST, ZGELSY, ZGEMM, + $ ZGETSLS, ZLACPY, ZLARNV, ZQRT13, ZQRT15, + $ ZQRT16, ZDSCAL, XLAENV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, INT, SQRT @@ -334,7 +334,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +362,10 @@ CALL ZGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_ZGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for ZGELST + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_ZGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for ZGETSLS CALL ZGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -390,9 +395,9 @@ LRWORK = MAX( LRWORK, LRWORK_ZGELSY, $ LRWORK_ZGELSS, LRWORK_ZGELSD ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGETSLS, - $ LWORK_ZGELSY, LWORK_ZGELSS, - $ LWORK_ZGELSD ) + LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGELST, + $ LWORK_ZGETSLS, LWORK_ZGELSY, + $ LWORK_ZGELSS, LWORK_ZGELSD ) END IF ENDDO ENDDO @@ -425,21 +430,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test ZGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test ZGELS -* * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +494,20 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for ZGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. 
NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for ZGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +530,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +539,34 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test ZGETSLS + END DO + END DO + END IF +* ===================================================== +* End test ZGELS +* ===================================================== +* ===================================================== +* Begin test ZGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +583,9 @@ IF( NCOLS.GT.0 ) THEN CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL ZSCAL( NCOLS*NRHS, - $ CONE / DBLE( NCOLS ), WORK, - $ 1 ) + CALL ZDSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), WORK, + $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +601,37 @@ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'ZGETSLS ' - CALL ZGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'ZGELST' + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'ZGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for ZGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for ZGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = ZQRT17( TRANS, 1, M, N, + RESULT( 4 ) = ZQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +639,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = ZQRT14( TRANS, M, N, + RESULT( 4 ) = ZQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +647,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test ZGELST +* ===================================================== +* ===================================================== +* Begin test ZGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL ZLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL ZSCAL( NCOLS*NRHS, + $ CONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL ZGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL ZLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'ZGETSLS ' + CALL ZGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for ZGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL ZQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for ZGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = ZQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = ZQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test ZGELSTSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -680,37 +839,37 @@ * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = ZQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = ZQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +895,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +951,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace 
of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/zerrls.f b/lapack-netlib/TESTING/LIN/zerrls.f index 66e56c8c6..22f049ee0 100644 --- a/lapack-netlib/TESTING/LIN/zerrls.f +++ b/lapack-netlib/TESTING/LIN/zerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> ZERRLS tests the error exits for the COMPLEX*16 least squares -*> driver routines (ZGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (ZGELS, ZGELST, ZGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELSY + EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELST, + $ ZGELSY, ZGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL ZGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) * +* ZGELST +* + SRNAMT = 'ZGELST' + INFOT = 1 + CALL ZGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) +* +* ZGETSLS +* + SRNAMT = 'ZGETSLS' + INFOT = 1 + CALL ZGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) +* * ZGELSS * SRNAMT = 'ZGELSS' From 1d32ce51359145d94eb6d592f8f5d43437f1a9f0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:42:50 +0100 Subject: [PATCH 116/154] Add ?GELST (Reference-LAPACK PR739) --- lapack-netlib/SRC/Makefile | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 49eb69cfe..49798b0c5 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o slarmm.o slatrs3.o strsyl3.o + sgesvdq.o slarmm.o slatrs3.o strsyl3.o sgelst.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o clatrs3.o ctrsyl3.o + cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o + dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o dgelst.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o zlatrs3.o ztrsyl3.o + zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o endif ifdef USEXBLAS From 5ff46f40921b287c3f34d86770de56413f214680 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:49:31 +0100 Subject: [PATCH 117/154] Add ?GELST (Reference-LAPACK PR739) --- cmake/lapack.cmake | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index ca3a1e184..8a5ff22ec 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -124,7 +124,7 @@ set(SLASRC ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f - slarmm.f slatrs3.f strsyl3.f) + slarmm.f slatrs3.f strsyl3.f sgelst.f) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -223,7 +223,7 @@ set(CLASRC chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f cungtsqr.f cungtsqr_row.f cunhr_col.f - clatrs3.f ctrsyl3.f ) + clatrs3.f ctrsyl3.f cgelst.f) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -316,7 +316,7 @@ set(DLASRC dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f - dlarmm.f dlatrs3.f dtrsyl3.f) + dlarmm.f dlatrs3.f dtrsyl3.f dgelst.f) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -419,7 +419,7 @@ set(ZLASRC zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zungtsqr.f zungtsqr_row.f zunhr_col.f - zlatrs3.f ztrsyl3.f) + zlatrs3.f ztrsyl3.f zgelst.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f @@ -622,7 +622,7 @@ set(SLASRC ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c 
slaorhr_col_getrfnp.c slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c - slarmm.c slatrs3.c strsyl3.c) + slarmm.c slatrs3.c strsyl3.c sgelst.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -638,7 +638,7 @@ set(CLASRC cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c cgehd2.c cgehrd.c cgelq2.c cgelqf.c cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c - cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c + cgeqr2.c cgeqr2p.c cgeqrf.c fcgeqrfp.c cgerfs.c cgerq2.c cgerqf.c cgesc2.c cgesdd.c cgesvd.c cgesvdx.c cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c cgesvx.c cgetc2.c cgetrf2.c @@ -720,7 +720,7 @@ set(CLASRC chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c cungtsqr.c cungtsqr_row.c cunhr_col.c - clatrs3.c ctrsyl3.c) + clatrs3.c ctrsyl3.c cgelst.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -812,7 +812,7 @@ set(DLASRC dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c - dlarmm.c dlatrs3.c dtrsyl3.c) + dlarmm.c dlatrs3.c dtrsyl3.c dgelst.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -913,7 +913,7 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c From f157d6d6718493215ae9ab915a9202a1018bbaf0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:50:57 +0100 Subject: [PATCH 118/154] Add C equivalents of ?GELST (for Reference-LAPACK PR739) --- lapack-netlib/SRC/cgelst.c | 1108 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgelst.c | 1104 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgelst.c | 1099 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgelst.c | 1115 ++++++++++++++++++++++++++++++++++++ 4 files changed, 4426 insertions(+) create mode 100644 lapack-netlib/SRC/cgelst.c create mode 100644 lapack-netlib/SRC/dgelst.c create mode 100644 lapack-netlib/SRC/sgelst.c create mode 100644 lapack-netlib/SRC/zgelst.c diff --git a/lapack-netlib/SRC/cgelst.c b/lapack-netlib/SRC/cgelst.c new file mode 100644 index 000000000..48ded643d --- /dev/null +++ b/lapack-netlib/SRC/cgelst.c @@ -0,0 +1,1108 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef 
_MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download CGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by CGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by CGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
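A minimal sketch of the two-call workspace-query pattern that the LWORK description above refers to, assuming the cgelst_ prototype and the complex/integer typedefs defined earlier in this translation unit are in scope (plus <stdlib.h> for malloc); the matrix dimensions and the malloc-based allocation are illustrative assumptions, not part of the routine itself.

      /* hypothetical caller: query the optimal LWORK, then solve min || B - A*X || */
      integer m = 4, n = 2, nrhs = 1, lda = 4, ldb = 4, lwork = -1, info = 0;
      complex a[4 * 2], b[4 * 1], wsize;
      /* ... fill a (M-by-N, column-major) and b (M-by-NRHS) ... */
      cgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, &wsize, &lwork, &info); /* LWORK = -1: size query only */
      lwork = (integer) wsize.r;                        /* optimal size is returned in WORK(1) */
      complex *work = (complex *) malloc(sizeof(complex) * lwork);
      cgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, &info);   /* actual solve */
      /* on return with info == 0, rows 1..N of b hold the least-squares solution */
      free(work);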
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complexGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int cgelst_(char *trans, integer *m, integer *n, integer * + nrhs, complex *a, integer *lda, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + real r__1; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ int slabad_(real *, real *); + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer mn; + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + extern real slamch_(char *); + extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *), xerbla_(char *, + integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern /* Subroutine */ int cgelqt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer scllen; + real bignum; + extern /* Subroutine */ int cgeqrt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer mnnrhs; + real smlnum; + logical lquery; + extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *), cgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *), cgemqrt_(char *, char *, integer *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! 
(lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "CGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = clange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = clange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. 
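As a reading aid for the branches that follow (a summary in the comments' own notation, not text from the routine), writing A = Q*R when M >= N and A = L*Q when M < N, the four cases reduce to:

      TRANS = 'N', M >= N  (least squares):  X = inv(R) * Q**H * B
      TRANS = 'N', M <  N  (minimum norm):   X = Q**H * inv(L) * B
      TRANS = 'C', M >= N  (minimum norm):   X = Q * inv(R**H) * B
      TRANS = 'C', M <  N  (least squares):  X = inv(L**H) * Q * B

In the first case the routine leaves rows N+1..M of Q**H * B in place, so the residual sum of squares of each column is the squared norm of those trailing rows, as stated in the documentation block above.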
*/ + + cgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + cgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + cgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + clascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + clascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + clascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + clascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + return 0; + +/* End of CGELST */ + +} /* cgelst_ */ + diff --git a/lapack-netlib/SRC/dgelst.c b/lapack-netlib/SRC/dgelst.c new file mode 100644 index 000000000..9327da4dd --- /dev/null +++ b/lapack-netlib/SRC/dgelst.c @@ -0,0 +1,1104 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint 
aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download DGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by DGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by DGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
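To make the storage conventions described above concrete, here is a minimal hypothetical caller, assuming the integer/doublereal typedefs and the dgelst_ prototype from this file are visible; the data values and the fixed LWORK = 64 (any value >= MN + max(MN,NRHS) is accepted) are illustrative assumptions.

      #include <stdio.h>
      /* fits b ~ A*x in the least-squares sense for a 3-by-2 column-major A */
      int main(void) {
          integer m = 3, n = 2, nrhs = 1, lda = 3, ldb = 3, lwork = 64, info = 0;
          doublereal a[6] = { 1., 1., 1.,     /* column 1 of A */
                              0., 1., 2. };   /* column 2 of A */
          doublereal b[3] = { 1., 2., 2. };   /* right-hand side, overwritten on exit */
          doublereal work[64];
          dgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, &info);
          if (info == 0)
              printf("x = (%g, %g)\n", b[0], b[1]);  /* rows 1..N of B hold the solution */
          return (int) info;                         /* b[2]^2 is the residual sum of squares */
      }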
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int dgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer mn; + extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *), dlaset_(char *, integer *, integer + *, doublereal *, doublereal *, doublereal *, integer *), + xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern /* Subroutine */ int dgelqt_(integer *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgeqrt_(integer *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *, doublereal *, integer *); + integer mnnrhs; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *), dgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublereal *, integer + *, doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgemqrt_(char *, char *, integer *, + integer *, integer *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! 
(lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "DGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (doublereal) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = dlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = dlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + dgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! 
tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + dgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + dgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + dlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + dlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + dlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (doublereal) lwopt; + + return 0; + +/* End of DGELST */ + +} /* dgelst_ */ + diff --git a/lapack-netlib/SRC/sgelst.c b/lapack-netlib/SRC/sgelst.c new file mode 100644 index 000000000..e0cd84cd9 --- /dev/null +++ b/lapack-netlib/SRC/sgelst.c @@ -0,0 +1,1099 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ 
+typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download SGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by SGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by SGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. 
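   As a usage illustration, a C caller of this driver typically issues a
   workspace query first. A minimal sketch only (it assumes the f2c-style
   prototype generated in this file, column-major data already filled in,
   <stdlib.h> for malloc, and no error checking; all variable names are
   local to the example):

     integer m = 6, n = 4, nrhs = 1, lda = 6, ldb = 6, lwork = -1, info;
     real a[6*4], b[6*1], wkopt, *work;
     sgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, &wkopt, &lwork, &info);
     lwork = (integer) wkopt;
     work = (real *) malloc(lwork * sizeof(real));
     sgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, &info);
     free(work);

   On exit rows 1 to N of B hold the least-squares solution and rows N+1
   to M of each column give its residual components, as described above.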
*/ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup realGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int sgelst_(char *trans, integer *m, integer *n, integer * + nrhs, real *a, integer *lda, real *b, integer *ldb, real *work, + integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ int slabad_(real *, real *); + integer mn; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, + real *, integer *), sgelqt_(integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *); + integer mnnrhs; + extern /* Subroutine */ int sgeqrt_(integer *, integer *, integer *, real + *, integer *, real *, integer *, real *, integer *); + real smlnum; + logical lquery; + extern /* Subroutine */ int strtrs_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *), sgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, real *, integer *, real *, + integer *, real *, integer *, real *, integer *), + sgemqrt_(char *, char *, integer *, integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *, real * + , integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! 
lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "SGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (real) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = slange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = slange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + sgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
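   (On the scaling above: SMLNUM is the safe minimum divided by the
   machine precision and BIGNUM is its reciprocal, so any matrix whose
   largest entry falls outside [SMLNUM, BIGNUM] is rescaled by SLASCL so
   that its norm lands exactly on the nearer bound; the corresponding
   "Undo scaling" block at the end of the routine then rescales the
   computed solution, which is why the branches below can treat A and B
   as already well scaled.)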
*/ + + sgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + sgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + sgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + slascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + slascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + slascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + slascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (real) lwopt; + + return 0; + +/* End of SGELST */ + +} /* sgelst_ */ + diff --git a/lapack-netlib/SRC/zgelst.c b/lapack-netlib/SRC/zgelst.c new file mode 100644 index 000000000..447cd30bb --- /dev/null +++ b/lapack-netlib/SRC/zgelst.c @@ -0,0 +1,1115 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef 
struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download ZGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by ZGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by ZGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
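   As a concrete instance of the bound above (numbers chosen only for
   illustration): for M = 1000, N = 600, NRHS = 2 we have MN = 600, so
   the minimum legal LWORK is 600 + max(600,2) = 1200, while with a
   block size NB = 32 the optimal LWORK is (600 + 600)*32 = 38400.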
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16GEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int zgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, + doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + doublereal d__1; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *); + integer mn; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex *, + integer *, integer *), zlaset_(char *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *); + integer mnnrhs; + extern /* Subroutine */ int zgelqt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + doublereal smlnum; + extern /* Subroutine */ int zgeqrt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + logical lquery; + extern /* Subroutine */ int ztrtrs_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + integer *), zgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zgemqrt_(char *, + char *, integer *, integer *, integer *, integer *, doublecomplex + *, integer *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. 
*/ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "ZGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "ZGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = zlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = zlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. 
&& bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + zgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + zgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + zgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + zlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + zlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + zlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + return 0; + +/* End of ZGELST */ + +} /* zgelst_ */ + From 88cd91c4902cd40978420df9db123b392d771ad7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 23:15:20 +0100 Subject: [PATCH 119/154] Fix stray character --- cmake/lapack.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 8a5ff22ec..17ff8d0a0 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -638,7 +638,7 @@ set(CLASRC cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c cgehd2.c cgehrd.c cgelq2.c cgelqf.c cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c - cgeqr2.c cgeqr2p.c cgeqrf.c fcgeqrfp.c cgerfs.c cgerq2.c cgerqf.c + cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c cgesc2.c cgesdd.c cgesvd.c cgesvdx.c cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c cgesvx.c cgetc2.c cgetrf2.c From eea1636380fe6b8462e2ae73cc0e6c3c1aa0e3ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 13:22:55 +0100 Subject: [PATCH 120/154] Use normwise criterion for INF eigenvalues in QZ (Reference-LAPACK PR698) --- lapack-netlib/SRC/chgeqz.f | 9 ++------- lapack-netlib/SRC/dhgeqz.f | 9 ++------- lapack-netlib/SRC/shgeqz.f | 9 ++------- lapack-netlib/SRC/zhgeqz.f | 9 ++------- 4 files changed, 8 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 8c1d62a87..50c6827ff 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -523,9 +523,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -551,10 +549,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/dhgeqz.f b/lapack-netlib/SRC/dhgeqz.f index 3fe2a083c..b5a2917e3 100644 --- a/lapack-netlib/SRC/dhgeqz.f +++ b/lapack-netlib/SRC/dhgeqz.f @@ -536,9 +536,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f index 79a9c6092..10fb2b7d7 100644 --- a/lapack-netlib/SRC/shgeqz.f +++ b/lapack-netlib/SRC/shgeqz.f @@ -536,9 +536,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index 302b69f34..c15e7aace 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -524,9 +524,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -552,10 +550,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
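*     (Context for the ?HGEQZ hunks in this patch: BTOL is the normwise
*      tolerance computed near the start of each routine, essentially
*      MAX( SAFMIN, ULP*norm(T) ), so a diagonal entry of T is now
*      declared zero - signalling an infinite eigenvalue - by comparison
*      with the whole matrix rather than with its two neighbouring
*      entries, which is the point of Reference-LAPACK PR 698 cited in
*      the commit message.)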
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A From 60af35bfab111416f78db3a7797f2134c7f23ea0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 13:25:21 +0100 Subject: [PATCH 121/154] Fix workspace query for ?SYEVD and ?HEEVD (Reference-LAPACK PR691) --- lapack-netlib/SRC/cheevd.f | 2 +- lapack-netlib/SRC/dsyevd.f | 2 +- lapack-netlib/SRC/ssyevd.f | 2 +- lapack-netlib/SRC/zheevd.f | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index 9a4a1efb7..2ddf74b98 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -284,7 +284,7 @@ LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index edbe896fe..eaaecd8d9 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -257,7 +257,7 @@ LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index 8b90d9263..ac0d0284d 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -255,7 +255,7 @@ LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index a6484eb03..7f58c7f72 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -284,7 +284,7 @@ LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF From 3f31b691211a772b61c0e016961e9c0d8f05a02e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 13:30:25 +0100 Subject: [PATCH 122/154] Add quick return if scaling with one (Reference-LAPACK PR674) --- lapack-netlib/SRC/clascl.f | 2 ++ lapack-netlib/SRC/dlascl.f | 2 ++ lapack-netlib/SRC/slascl.f | 2 ++ lapack-netlib/SRC/zlascl.f | 2 ++ 4 files changed, 8 insertions(+) diff --git a/lapack-netlib/SRC/clascl.f b/lapack-netlib/SRC/clascl.f index 399af23a4..f9aace0bc 100644 --- a/lapack-netlib/SRC/clascl.f +++ b/lapack-netlib/SRC/clascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/dlascl.f b/lapack-netlib/SRC/dlascl.f index 05ad1c4f3..0a4bf21ce 100644 --- a/lapack-netlib/SRC/dlascl.f +++ b/lapack-netlib/SRC/dlascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/slascl.f b/lapack-netlib/SRC/slascl.f index e1cb420ea..28cbd6514 100644 --- a/lapack-netlib/SRC/slascl.f +++ b/lapack-netlib/SRC/slascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. 
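*     (Two brief notes on the fixes flowing together here: the ?SYEVD /
*      ?HEEVD workspace queries above now multiply ILAENV's block size
*      by N because the blocked reduction to tridiagonal form needs on
*      the order of N*NB workspace, not just NB; and the quick return
*      being added to the ?LASCL routines skips the element-by-element
*      scaling pass entirely when the computed multiplier is exactly
*      one, i.e. when the requested CFROM and CTO coincide.)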
ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/zlascl.f b/lapack-netlib/SRC/zlascl.f index 3d53f5ae6..4cce5ff5e 100644 --- a/lapack-netlib/SRC/zlascl.f +++ b/lapack-netlib/SRC/zlascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * From 9e29312c8311efb22998029b330c3cba4c04b5da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 16:34:45 +0100 Subject: [PATCH 123/154] Fix type precision and function documentation (Reference-LAPACK PRs 647+702) --- lapack-netlib/SRC/dorbdb6.f | 84 +++++++++++++++++++++---------------- lapack-netlib/SRC/zunbdb6.f | 84 +++++++++++++++++++++---------------- 2 files changed, 94 insertions(+), 74 deletions(-) diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f index fac52f760..45c8ba8a2 100644 --- a/lapack-netlib/SRC/dorbdb6.f +++ b/lapack-netlib/SRC/dorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) DOUBLE PRECISION NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 ) * .. * .. Local Scalars .. - INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. EXTERNAL DGEMV, DLASSQ, XERBLA @@ -210,17 +219,17 @@ CALL XERBLA( 'DORBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 
0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ * End of DORBDB6 * END - diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f index ec681b597..ed666e449 100644 --- a/lapack-netlib/SRC/zunbdb6.f +++ b/lapack-netlib/SRC/zunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) COMPLEX*16 NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0), $ ZERO = (0.0D0,0.0D0) ) * .. * .. Local Scalars .. - INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. 
EXTERNAL ZGEMV, ZLASSQ, XERBLA @@ -211,17 +220,17 @@ CALL XERBLA( 'ZUNBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ * End of ZUNBDB6 * END - From b9468205021146e1f45f9c91e1bda9699ef68bf4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 16:36:19 +0100 Subject: [PATCH 124/154] Fix uninitialized variable (Reference-LAPACK PR647) --- lapack-netlib/SRC/cunbdb6.f | 84 +++++++++++++++++++++---------------- lapack-netlib/SRC/sorbdb6.f | 84 +++++++++++++++++++++---------------- 2 files changed, 94 insertions(+), 74 deletions(-) diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f index 7acc99cb8..b93a389d6 100644 --- a/lapack-netlib/SRC/cunbdb6.f +++ b/lapack-netlib/SRC/cunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. 
+*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ * ===================================================================== * * .. Parameters .. - REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) COMPLEX NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0), $ ZERO = (0.0E0,0.0E0) ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. EXTERNAL CGEMV, CLASSQ, XERBLA @@ -211,17 +220,17 @@ CALL XERBLA( 'CUNBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. 
ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ * End of CUNBDB6 * END - diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f index a23b42beb..b2449e3be 100644 --- a/lapack-netlib/SRC/sorbdb6.f +++ b/lapack-netlib/SRC/sorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ * ===================================================================== * * .. Parameters .. - REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) REAL NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. EXTERNAL SGEMV, SLASSQ, XERBLA @@ -210,17 +219,17 @@ CALL XERBLA( 'SORBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. 
N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ * End of SORBDB6 * END - From aaea0804bcb0318e96d3fcfd32dc66570c633c4a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 16:38:57 +0100 Subject: [PATCH 125/154] Fix function documentation (Reference-LAPACK PR697) --- lapack-netlib/SRC/cunbdb2.f | 4 ++-- lapack-netlib/SRC/cunbdb4.f | 4 ++-- lapack-netlib/SRC/dorbdb2.f | 4 ++-- lapack-netlib/SRC/dorbdb4.f | 4 ++-- lapack-netlib/SRC/sorbdb2.f | 4 ++-- lapack-netlib/SRC/sorbdb4.f | 4 ++-- lapack-netlib/SRC/zunbdb2.f | 4 ++-- lapack-netlib/SRC/zunbdb4.f | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index db238f925..b45db6100 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index e6afd89c3..117f23d08 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f index 64e4645bc..a0dacbb16 100644 --- a/lapack-netlib/SRC/dorbdb2.f +++ b/lapack-netlib/SRC/dorbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is DOUBLE PRECISION array, dimension (P) +*> TAUP1 is DOUBLE PRECISION array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. 
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is DOUBLE PRECISION array, dimension (M-P)
+*>          TAUP2 is DOUBLE PRECISION array, dimension (Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f
index a09568415..08604be45 100644
--- a/lapack-netlib/SRC/dorbdb4.f
+++ b/lapack-netlib/SRC/dorbdb4.f
@@ -124,14 +124,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is DOUBLE PRECISION array, dimension (P)
+*>          TAUP1 is DOUBLE PRECISION array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is DOUBLE PRECISION array, dimension (M-P)
+*>          TAUP2 is DOUBLE PRECISION array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f
index ad3eb269d..484d352f8 100644
--- a/lapack-netlib/SRC/sorbdb2.f
+++ b/lapack-netlib/SRC/sorbdb2.f
@@ -122,14 +122,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is REAL array, dimension (P)
+*>          TAUP1 is REAL array, dimension (P-1)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is REAL array, dimension (M-P)
+*>          TAUP2 is REAL array, dimension (Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f
index b18ed3b27..bf60fb7bb 100644
--- a/lapack-netlib/SRC/sorbdb4.f
+++ b/lapack-netlib/SRC/sorbdb4.f
@@ -124,14 +124,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is REAL array, dimension (P)
+*>          TAUP1 is REAL array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is REAL array, dimension (M-P)
+*>          TAUP2 is REAL array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f
index 412d8d8d0..46b08aa1e 100644
--- a/lapack-netlib/SRC/zunbdb2.f
+++ b/lapack-netlib/SRC/zunbdb2.f
@@ -122,14 +122,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is COMPLEX*16 array, dimension (P)
+*>          TAUP1 is COMPLEX*16 array, dimension (P-1)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is COMPLEX*16 array, dimension (M-P)
+*>          TAUP2 is COMPLEX*16 array, dimension (Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f
index b1fcd8bd0..4672cfa67 100644
--- a/lapack-netlib/SRC/zunbdb4.f
+++ b/lapack-netlib/SRC/zunbdb4.f
@@ -124,14 +124,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is COMPLEX*16 array, dimension (P)
+*>          TAUP1 is COMPLEX*16 array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is COMPLEX*16 array, dimension (M-P)
+*>          TAUP2 is COMPLEX*16 array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
*> \endverbatim From 6f09e4c1212db6f33407efd8bb4588335c626fee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 19:37:28 +0100 Subject: [PATCH 126/154] Improve FMA usage in ?LAQR5 (Reference-LAPACK PR681) --- lapack-netlib/SRC/claqr5.f | 96 ++++++++++++++++++---------------- lapack-netlib/SRC/dlaqr5.f | 102 +++++++++++++++++++++---------------- lapack-netlib/SRC/slaqr5.f | 102 +++++++++++++++++++++---------------- lapack-netlib/SRC/zlaqr5.f | 97 +++++++++++++++++++---------------- 4 files changed, 223 insertions(+), 174 deletions(-) diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index 95cc33b9d..0a01cc226 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0e0, RONE = 1.0e0 ) * .. * .. Local Scalars .. - COMPLEX ALPHA, BETA, CDUM, REFSUM + COMPLEX ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*CONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = CONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ CONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,28 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + CONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -688,13 +692,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )* - $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + CONJG( V( 2, M ) )* + $ H( K+2, J ) + CONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +718,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +737,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index 0c63ab800..43b4ac72a 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. 
==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index b9bae9376..a4f805674 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. 
==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 3185508bc..4fa5ee5b0 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0d0, RONE = 1.0d0 ) * .. * .. Local Scalars .. - COMPLEX*16 ALPHA, BETA, CDUM, REFSUM + COMPLEX*16 ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*DCONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = DCONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ DCONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,29 @@ * . deflation check. We still delay most of the * . 
updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + $ + DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + DCONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -688,13 +693,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + DCONJG( V( 2, M ) )*H( K+2, J ) + $ + DCONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +719,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +738,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF From c6816bb5760827fb073fd65db49ee2178933e20d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 19:39:12 
+0100 Subject: [PATCH 127/154] Use normwise criterion in multishift QZ (Reference-LAPACK PR698) --- lapack-netlib/SRC/claqz0.f | 16 ++++++---------- lapack-netlib/SRC/dlaqz0.f | 16 ++++++---------- lapack-netlib/SRC/slaqz0.f | 17 +++++++---------- lapack-netlib/SRC/zlaqz0.f | 17 +++++++---------- 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 2284fd65d..9cc25c6dc 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -299,7 +299,7 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, BNORM, BTOL COMPLEX :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -312,7 +312,7 @@ * External Functions EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, $ CLARTG, CROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -466,6 +466,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = CLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -528,15 +531,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index 1bf65fd60..5b0965406 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -322,7 +322,7 @@ * Local scalars DOUBLE PRECISION :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, - $ TEMP, SWAP + $ TEMP, SWAP, BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -334,7 +334,7 @@ * External Functions EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, $ DLARTG, DROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -486,6 +486,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = DLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -562,15 +565,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. 
BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 15913be88..69f402914 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -318,7 +318,8 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP + REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP, + $ BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -330,7 +331,7 @@ * External Functions EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, $ SLARTG, SROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, SLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -482,6 +483,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = SLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -558,15 +562,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 2616f20b5..0d8884ed5 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -300,7 +300,8 @@ PARAMETER( ZERO = 0.0D0, ONE = 1.0D0, HALF = 0.5D0 ) * Local scalars - DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, + $ BNORM, BTOL COMPLEX*16 :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -313,7 +314,7 @@ * External Functions EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, $ ZLARTG, ZROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -467,6 +468,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = ZLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -529,15 +533,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. 
BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it From 31d2145988b0b952f702d3fdbeb910f6ff8e1489 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 22:44:36 +0100 Subject: [PATCH 128/154] Set scale early for robust triangular solvers (Reference-LAPACK PR712) --- lapack-netlib/SRC/clatbs.f | 9 ++--- lapack-netlib/SRC/clatrs.f | 79 ++++++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dlatbs.f | 2 +- lapack-netlib/SRC/dlatrs.f | 69 ++++++++++++++++++++++++++++++--- lapack-netlib/SRC/slatbs.f | 2 +- lapack-netlib/SRC/slatrs.f | 69 ++++++++++++++++++++++++++++++--- lapack-netlib/SRC/zlatbs.f | 9 ++--- lapack-netlib/SRC/zlatrs.f | 79 ++++++++++++++++++++++++++++++++++---- 8 files changed, 278 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/clatbs.f b/lapack-netlib/SRC/clatbs.f index 606f963d3..97abcadce 100644 --- a/lapack-netlib/SRC/clatbs.f +++ b/lapack-netlib/SRC/clatbs.f @@ -278,7 +278,7 @@ $ CDOTU, CLADIV * .. * .. External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTBSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTBSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -324,17 +324,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/clatrs.f b/lapack-netlib/SRC/clatrs.f index 946ab8068..91334b706 100644 --- a/lapack-netlib/SRC/clatrs.f +++ b/lapack-netlib/SRC/clatrs.f @@ -274,7 +274,7 @@ $ CDOTU, CLADIV * .. * .. External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTRSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTRSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -318,17 +318,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. 
+* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. + CALL CTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/dlatbs.f b/lapack-netlib/SRC/dlatbs.f index 4b71d5399..6a812743b 100644 --- a/lapack-netlib/SRC/dlatbs.f +++ b/lapack-netlib/SRC/dlatbs.f @@ -310,6 +310,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index 43f92911d..be156bee2 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -264,8 +264,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DDOT, DLAMCH - EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH + DOUBLE PRECISION DASUM, DDOT, DLAMCH, DLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DAXPY, DSCAL, DTRSV, XERBLA @@ -304,6 +304,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. 
+* + DO J = 1, N - 1 + TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. + CALL DTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/slatbs.f b/lapack-netlib/SRC/slatbs.f index 617d0b2f5..77940f8cd 100644 --- a/lapack-netlib/SRC/slatbs.f +++ b/lapack-netlib/SRC/slatbs.f @@ -310,6 +310,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 94e0e88bc..0761d656f 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -264,8 +264,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SDOT, SLAMCH - EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH + REAL SASUM, SDOT, SLAMCH, SLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH, SLANGE * .. * .. External Subroutines .. EXTERNAL SAXPY, SSCAL, STRSV, XERBLA @@ -304,6 +304,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. 
+ CALL STRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/zlatbs.f b/lapack-netlib/SRC/zlatbs.f index b7b2cb8ae..bdffa1ea9 100644 --- a/lapack-netlib/SRC/zlatbs.f +++ b/lapack-netlib/SRC/zlatbs.f @@ -278,7 +278,7 @@ $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -324,17 +324,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/zlatrs.f b/lapack-netlib/SRC/zlatrs.f index 91bb688ec..2276ace87 100644 --- a/lapack-netlib/SRC/zlatrs.f +++ b/lapack-netlib/SRC/zlatrs.f @@ -274,7 +274,7 @@ $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -318,17 +318,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. 
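The Case 1 / Case 2 split introduced in these hunks exists because the old code could turn an overflowed column norm into NaN: with TMAX = Inf, TSCAL = 1/(SMLNUM*TMAX) (or HALF/(SMLNUM*TMAX) in the complex routines) evaluates to zero, and the subsequent CNORM(J)*TSCAL becomes 0*Inf = NaN. A minimal C sketch of that guard follows; it is an illustration only, not part of the patch, and the helper name, the fallback return value, and the use of DBL_MIN/DBL_EPSILON as stand-ins for SLAMCH/DLAMCH are assumptions of ours.

#include <math.h>
#include <float.h>

/* Rescale column norms so that later comparisons stay finite.
 * Returns the scale factor applied to cnorm, or 0.0 when at least one
 * norm is Inf and the caller must recompute the norms from the matrix
 * entries, as the patched routines do column by column. */
static double guard_cnorm(int n, double *cnorm)
{
    double smlnum = DBL_MIN / DBL_EPSILON;   /* safe minimum / precision */
    double bignum = 1.0 / smlnum;
    double tmax = 0.0;
    for (int j = 0; j < n; ++j)
        if (cnorm[j] > tmax) tmax = cnorm[j];

    if (tmax <= bignum)                      /* no rescaling needed */
        return 1.0;

    if (isfinite(tmax)) {                    /* Case 1: plain rescale is safe */
        double tscal = 1.0 / (smlnum * tmax);
        for (int j = 0; j < n; ++j) cnorm[j] *= tscal;
        return tscal;
    }

    /* Case 2: some cnorm[j] overflowed; 1/(smlnum*Inf) == 0 and
     * 0 * Inf == NaN, so a plain rescale is not safe here. */
    return 0.0;
}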
+ CALL ZTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the From e00f0fb26ac5ea52120db315b9c439515fd16572 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 22:46:58 +0100 Subject: [PATCH 129/154] Fix function documentation (Reference-LAPACK PR747) --- lapack-netlib/SRC/clarscl2.f | 10 +++++----- lapack-netlib/SRC/clascl2.f | 12 ++++++------ lapack-netlib/SRC/dlarscl2.f | 10 +++++----- lapack-netlib/SRC/dlascl2.f | 10 +++++----- lapack-netlib/SRC/slarscl2.f | 10 +++++----- lapack-netlib/SRC/slascl2.f | 10 +++++----- lapack-netlib/SRC/zlarscl2.f | 10 +++++----- lapack-netlib/SRC/zlascl2.f | 10 +++++----- 8 files changed, 41 insertions(+), 41 deletions(-) diff --git a/lapack-netlib/SRC/clarscl2.f b/lapack-netlib/SRC/clarscl2.f index 26b028dbb..f4e68523b 100644 --- a/lapack-netlib/SRC/clarscl2.f +++ b/lapack-netlib/SRC/clarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> CLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> CLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the REAL diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/clascl2.f b/lapack-netlib/SRC/clascl2.f index 2ae27975c..882273b5e 100644 --- a/lapack-netlib/SRC/clascl2.f +++ b/lapack-netlib/SRC/clascl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLASCL2 performs diagonal scaling on a vector. +*> \brief \b CLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,9 +34,9 @@ *> *> \verbatim *> -*> CLASCL2 performs a diagonal scaling on a vector: +*> CLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x -*> where the diagonal REAL matrix D is stored as a vector. +*> where the diagonal REAL matrix D is stored as a matrix. *> *> Eventually to be replaced by BLAS_cge_diag_scale in the new BLAS *> standard. @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlarscl2.f b/lapack-netlib/SRC/dlarscl2.f index 2468e2702..cc4b9aa3c 100644 --- a/lapack-netlib/SRC/dlarscl2.f +++ b/lapack-netlib/SRC/dlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a matrix. 
* * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> DLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlascl2.f b/lapack-netlib/SRC/dlascl2.f index 901e43c49..568e296ad 100644 --- a/lapack-netlib/SRC/dlascl2.f +++ b/lapack-netlib/SRC/dlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLASCL2 performs diagonal scaling on a vector. +*> \brief \b DLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLASCL2 performs a diagonal scaling on a vector: +*> DLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slarscl2.f b/lapack-netlib/SRC/slarscl2.f index 5726f12cd..c7b77c908 100644 --- a/lapack-netlib/SRC/slarscl2.f +++ b/lapack-netlib/SRC/slarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> SLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slascl2.f b/lapack-netlib/SRC/slascl2.f index 07b506a8c..5efc1cfcd 100644 --- a/lapack-netlib/SRC/slascl2.f +++ b/lapack-netlib/SRC/slascl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLASCL2 performs diagonal scaling on a vector. +*> \brief \b SLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLASCL2 performs a diagonal scaling on a vector: +*> SLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. 
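The corrected wording amounts to this: X is an M-by-N matrix, scaled row by row by a diagonal matrix D that is stored as a length-M vector. A short C sketch of the operation follows, using the column-major layout and LDX >= M convention; the function and variable names are ours, chosen only for illustration, not LAPACK's.

#include <stddef.h>

/* x <- D * x : multiply row i of the m-by-n matrix X by d[i] (the LASCL2
 * operation); dividing by d[i] instead gives the LARSCL2 operation. */
static void diag_scale(int m, int n, const double *d, double *x, int ldx)
{
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            x[i + (size_t)j * ldx] *= d[i];
}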
*> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlarscl2.f b/lapack-netlib/SRC/zlarscl2.f index 4a1e1603a..e61865906 100644 --- a/lapack-netlib/SRC/zlarscl2.f +++ b/lapack-netlib/SRC/zlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> ZLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlascl2.f b/lapack-netlib/SRC/zlascl2.f index c4e6992fb..26406c363 100644 --- a/lapack-netlib/SRC/zlascl2.f +++ b/lapack-netlib/SRC/zlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLASCL2 performs diagonal scaling on a vector. +*> \brief \b ZLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLASCL2 performs a diagonal scaling on a vector: +*> ZLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. 
*> \endverbatim * * Authors: From 7ae4269add1be1bff371c4e9d4d175ba7c630085 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 22:52:28 +0100 Subject: [PATCH 130/154] Use new algorithms for computing Givens rotations (Reference-LAPACK PR631) --- lapack-netlib/SRC/clartg.f90 | 159 +++++++++++++++++++++++----------- lapack-netlib/SRC/dlartg.f90 | 30 +++---- lapack-netlib/SRC/slartg.f90 | 30 +++---- lapack-netlib/SRC/zlartg.f90 | 161 ++++++++++++++++++++++++----------- 4 files changed, 247 insertions(+), 133 deletions(-) diff --git a/lapack-netlib/SRC/clartg.f90 b/lapack-netlib/SRC/clartg.f90 index 13a629a34..6231f8520 100644 --- a/lapack-netlib/SRC/clartg.f90 +++ b/lapack-netlib/SRC/clartg.f90 @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,19 +38,20 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be -!> identical to those returned by CLARTG. +!> identical to those returned by SLARTG. !> !> The algorithm used to compute these quantities incorporates scaling !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine CROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine CROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine CLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, two=>stwo, czero, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +127,7 @@ subroutine CLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine CLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine CLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. 
g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -181,32 +195,51 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine CLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. 
+ d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return diff --git a/lapack-netlib/SRC/dlartg.f90 b/lapack-netlib/SRC/dlartg.f90 index ef8c6e386..b7049c32f 100644 --- a/lapack-netlib/SRC/dlartg.f90 +++ b/lapack-netlib/SRC/dlartg.f90 @@ -11,7 +11,7 @@ ! SUBROUTINE DLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C, F, G, R, S +! REAL(wp) C, F, G, R, S ! .. ! !> \par Purpose: @@ -45,8 +45,6 @@ !> floating point operations (saves work in DBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine DLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, half=>dhalf, one=>done, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine DLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine DLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/slartg.f90 b/lapack-netlib/SRC/slartg.f90 index a9af1aa8d..8a5a8f26a 100644 --- a/lapack-netlib/SRC/slartg.f90 +++ b/lapack-netlib/SRC/slartg.f90 @@ -35,7 +35,7 @@ !> square root of the sum of squares. !> !> This version is discontinuous in R at F = 0 but it returns the same -!> C and S as SLARTG for complex inputs (F,0) and (G,0). +!> C and S as CLARTG for complex inputs (F,0) and (G,0). !> !> This is a more accurate version of the BLAS1 routine SROTG, !> with the following other differences: @@ -45,8 +45,6 @@ !> floating point operations (saves work in SBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine SLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, half=>shalf, one=>sone, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine SLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. 
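The real-precision routines DLARTG (above) and SLARTG (below) now share the same scale-then-rotate structure: rotate directly when |f| and |g| lie inside [rtmin, rtmax], otherwise divide both by u = min(safmax, max(safmin, |f|, |g|)), rotate the scaled pair, and multiply r back by u. The following C sketch mirrors that structure but is not part of the patch; the helper name and the use of DBL_MIN as a stand-in for the safe minimum are assumptions of ours.

#include <math.h>
#include <float.h>

/* Compute c, s, r with  [ c  s; -s  c ] * [ f; g ] = [ r; 0 ]. */
static void givens(double f, double g, double *c, double *s, double *r)
{
    double safmin = DBL_MIN, safmax = 1.0 / DBL_MIN;
    double rtmin = sqrt(safmin), rtmax = sqrt(safmax / 2.0);
    double f1 = fabs(f), g1 = fabs(g);

    if (g == 0.0) {
        *c = 1.0; *s = 0.0; *r = f;
    } else if (f == 0.0) {
        *c = 0.0; *s = copysign(1.0, g); *r = g1;
    } else if (f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) {
        double d = sqrt(f * f + g * g);          /* cannot overflow here */
        *c = f1 / d;
        *r = copysign(d, f);
        *s = g / *r;
    } else {                                     /* scale into a safe range */
        double u = fmin(safmax, fmax(safmin, fmax(f1, g1)));
        double fs = f / u, gs = g / u;
        double d = sqrt(fs * fs + gs * gs);
        *c = fabs(fs) / d;
        *r = copysign(d, f);
        *s = gs / *r;
        *r *= u;                                 /* undo the scaling of r */
    }
}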
- real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine SLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/zlartg.f90 b/lapack-netlib/SRC/zlartg.f90 index 337a4dda8..a4f9bd4b0 100644 --- a/lapack-netlib/SRC/zlartg.f90 +++ b/lapack-netlib/SRC/zlartg.f90 @@ -11,8 +11,8 @@ ! SUBROUTINE ZLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C -! COMPLEX(wp) F, G, R, S +! REAL(wp) C +! COMPLEX(wp) F, G, R, S ! .. ! !> \par Purpose: @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,6 +38,10 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be !> identical to those returned by DLARTG. @@ -46,11 +50,8 @@ !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine ZROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine ZROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine ZLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, two=>dtwo, czero=>zzero, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +127,7 @@ subroutine ZLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine ZLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. 
+! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine ZLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -181,32 +195,51 @@ subroutine ZLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine ZLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! 
safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return From 50aba029107ef79a7c4a8836955cd743f1cf2e59 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 18:00:31 +0100 Subject: [PATCH 131/154] Simplify ?SYSWAPR and fix its documentation (Reference-LAPACK 217) --- lapack-netlib/SRC/csyswapr.f | 43 ++++++++++----------------------- lapack-netlib/SRC/dsyswapr.f | 47 ++++++++++++------------------------ lapack-netlib/SRC/ssyswapr.f | 47 ++++++++++++------------------------ lapack-netlib/SRC/zsyswapr.f | 47 ++++++++++++------------------------ 4 files changed, 58 insertions(+), 126 deletions(-) diff --git a/lapack-netlib/SRC/csyswapr.f b/lapack-netlib/SRC/csyswapr.f index 185d81922..04004f3c1 100644 --- a/lapack-netlib/SRC/csyswapr.f +++ b/lapack-netlib/SRC/csyswapr.f @@ -58,15 +58,13 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by CSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -116,7 +114,6 @@ * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX TMP * * .. External Functions .. 
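The simplification in the hunks below replaces elementwise DO loops with single xSWAP calls, using an increment of LDA to walk along a row of the column-major matrix and an increment of 1 to walk down a column, mirroring the real-precision call CALL DSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) that appears further down. A small C sketch of that strided exchange follows; in real code one would simply call the BLAS routine, and the names and 0-based indexing here are our own.

#include <stddef.h>

/* Generic strided swap, the core of what xSWAP provides. */
static void swap_strided(int n, double *x, int incx, double *y, int incy)
{
    for (int k = 0; k < n; ++k) {
        double t = x[k * incx];
        x[k * incx] = y[k * incy];
        y[k * incy] = t;
    }
}

/* Upper-triangle middle section of the row/column exchange, 0-based i1 < i2:
 * A(i1, i1+1 .. i2-1)  <->  A(i1+1 .. i2-1, i2). */
static void syswapr_upper_middle(double *a, int lda, int i1, int i2)
{
    swap_strided(i2 - i1 - 1,
                 &a[i1 + (size_t)(i1 + 1) * lda], lda,
                 &a[(i1 + 1) + (size_t)i2 * lda], 1);
}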
@@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE CSYSWAPR diff --git a/lapack-netlib/SRC/dsyswapr.f b/lapack-netlib/SRC/dsyswapr.f index c60ccbefc..93f6195f2 100644 --- a/lapack-netlib/SRC/dsyswapr.f +++ b/lapack-netlib/SRC/dsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is DOUBLE PRECISION array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by DSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is DOUBLE PRECISION array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, N ) + DOUBLE PRECISION A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I DOUBLE PRECISION TMP * * .. External Functions .. 
@@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE DSYSWAPR diff --git a/lapack-netlib/SRC/ssyswapr.f b/lapack-netlib/SRC/ssyswapr.f index 5e4265d7a..e1ab5a22a 100644 --- a/lapack-netlib/SRC/ssyswapr.f +++ b/lapack-netlib/SRC/ssyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is REAL array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by SSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is REAL array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - REAL A( LDA, N ) + REAL A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I REAL TMP * * .. External Functions .. @@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE SSYSWAPR diff --git a/lapack-netlib/SRC/zsyswapr.f b/lapack-netlib/SRC/zsyswapr.f index 1f1a87857..eb3c98c34 100644 --- a/lapack-netlib/SRC/zsyswapr.f +++ b/lapack-netlib/SRC/zsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by ZSYTRF. 
-*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is COMPLEX*16 array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, N ) + COMPLEX*16 A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX*16 TMP * * .. External Functions .. @@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE ZSYSWAPR From c45edcb537564999cffd53e81555927fd6ff7d7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 19:59:33 +0100 Subject: [PATCH 132/154] Fix typo in comment (Reference-LAPACK PR735) --- lapack-netlib/SRC/ieeeck.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/ieeeck.f b/lapack-netlib/SRC/ieeeck.f index 74065c3b4..f9f6332ec 100644 --- a/lapack-netlib/SRC/ieeeck.f +++ b/lapack-netlib/SRC/ieeeck.f @@ -41,7 +41,7 @@ *> \param[in] ISPEC *> \verbatim *> ISPEC is INTEGER -*> Specifies whether to test just for inifinity arithmetic +*> Specifies whether to test just for infinity arithmetic *> or whether to test for infinity and NaN arithmetic. *> = 0: Verify infinity arithmetic only. *> = 1: Verify infinity and NaN arithmetic. From f8f2bebf118880774fca7c2c443b3e088276e207 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 20:01:47 +0100 Subject: [PATCH 133/154] Fix function documentation for LAPACK ?TPRFB (Reference-LAPACK PR665) --- lapack-netlib/SRC/ctprfb.f | 2 +- lapack-netlib/SRC/dtprfb.f | 2 +- lapack-netlib/SRC/stprfb.f | 86 +++++++++++++++++++------------------- lapack-netlib/SRC/ztprfb.f | 2 +- 4 files changed, 46 insertions(+), 46 deletions(-) diff --git a/lapack-netlib/SRC/ctprfb.f b/lapack-netlib/SRC/ctprfb.f index 11496180f..6cd5f05bd 100644 --- a/lapack-netlib/SRC/ctprfb.f +++ b/lapack-netlib/SRC/ctprfb.f @@ -1,4 +1,4 @@ -*> \brief \b CTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. 
+*> \brief \b CTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/dtprfb.f b/lapack-netlib/SRC/dtprfb.f index a3fc7d6c6..c015075b3 100644 --- a/lapack-netlib/SRC/dtprfb.f +++ b/lapack-netlib/SRC/dtprfb.f @@ -1,4 +1,4 @@ -*> \brief \b DTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b DTPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/stprfb.f b/lapack-netlib/SRC/stprfb.f index 64e8b34f5..d91a80dfb 100644 --- a/lapack-netlib/SRC/stprfb.f +++ b/lapack-netlib/SRC/stprfb.f @@ -1,4 +1,4 @@ -*> \brief \b STPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b STPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * @@ -37,7 +37,7 @@ *> \verbatim *> *> STPRFB applies a real "triangular-pentagonal" block reflector H or its -*> conjugate transpose H^H to a real matrix C, which is composed of two +*> transpose H**T to a real matrix C, which is composed of two *> blocks A and B, either from the left or right. *> *> \endverbatim @@ -48,15 +48,15 @@ *> \param[in] SIDE *> \verbatim *> SIDE is CHARACTER*1 -*> = 'L': apply H or H^H from the Left -*> = 'R': apply H or H^H from the Right +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right *> \endverbatim *> *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 *> = 'N': apply H (No transpose) -*> = 'C': apply H^H (Conjugate transpose) +*> = 'T': apply H**T (Transpose) *> \endverbatim *> *> \param[in] DIRECT @@ -145,7 +145,7 @@ *> (LDA,N) if SIDE = 'L' or (LDA,K) if SIDE = 'R' *> On entry, the K-by-N or M-by-K matrix A. *> On exit, A is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. *> \endverbatim *> *> \param[in] LDA @@ -161,7 +161,7 @@ *> B is REAL array, dimension (LDB,N) *> On entry, the M-by-N matrix B. *> On exit, B is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. 
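The corrected documentation describes applying H = I - W T W**T (or its transpose) to a matrix split into the blocks A and B. The rank-one special case, a single Householder reflector H = I - tau*v*v**T applied from the left, already shows the shape of the update; the C sketch below covers only that special case, not the pentagonal blocked algorithm of ?TPRFB, and its names are ours.

#include <stddef.h>

/* C <- (I - tau*v*v^T) * C  for a column-major m-by-n matrix C. */
static void apply_householder_left(int m, int n, double tau,
                                   const double *v, double *c, int ldc)
{
    for (int j = 0; j < n; ++j) {
        double *cj = &c[(size_t)j * ldc];
        double w = 0.0;                          /* w = v^T * C(:,j) */
        for (int i = 0; i < m; ++i)
            w += v[i] * cj[i];
        for (int i = 0; i < m; ++i)              /* C(:,j) -= tau * w * v */
            cj[i] -= tau * w * v[i];
    }
}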
*> \endverbatim *> *> \param[in] LDB @@ -327,13 +327,13 @@ * Let W = [ I ] (K-by-K) * [ V ] (M-by-K) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -388,12 +388,12 @@ * Let W = [ I ] (K-by-K) * [ V ] (N-by-K) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -448,13 +448,13 @@ * Let W = [ V ] (M-by-K) * [ I ] (K-by-K) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -510,12 +510,12 @@ * Let W = [ V ] (N-by-K) * [ I ] (K-by-K) * -* Form C H or C H^H where C = [ B A ] (B is M-by-N, A is M-by-K) +* Form C H or C H**T where C = [ B A ] (B is M-by-N, A is M-by-K) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -569,13 +569,13 @@ * * Let W = [ I V ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -629,12 +629,12 @@ * * Let W = [ I V ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * 
--------------------------------------------------------------------------- * @@ -688,13 +688,13 @@ * * Let W = [ V I ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -748,12 +748,12 @@ * * Let W = [ V I ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ B A ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ B A ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * --------------------------------------------------------------------------- * diff --git a/lapack-netlib/SRC/ztprfb.f b/lapack-netlib/SRC/ztprfb.f index 2edbd0566..7b1bc17a0 100644 --- a/lapack-netlib/SRC/ztprfb.f +++ b/lapack-netlib/SRC/ztprfb.f @@ -1,4 +1,4 @@ -*> \brief \b ZTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b ZTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * From afcd7e88b6610bcd8dd504f43ce6fe545048242d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 21:18:39 +0100 Subject: [PATCH 134/154] Improve convergence of DLAED4/SLAED4 (Reference-LAPACK PR655) --- lapack-netlib/SRC/dlaed4.f | 7 +++++-- lapack-netlib/SRC/slaed4.f | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/dlaed4.f b/lapack-netlib/SRC/dlaed4.f index 3ee3ef920..b51e23d85 100644 --- a/lapack-netlib/SRC/dlaed4.f +++ b/lapack-netlib/SRC/dlaed4.f @@ -328,9 +328,12 @@ IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE diff --git a/lapack-netlib/SRC/slaed4.f b/lapack-netlib/SRC/slaed4.f index f056746d8..339c5029c 100644 --- a/lapack-netlib/SRC/slaed4.f +++ b/lapack-netlib/SRC/slaed4.f @@ -328,9 +328,12 @@ IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE From d3213575586c4fa3d3d5654247a184d2ddece7e3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 21:19:44 +0100 Subject: [PATCH 135/154] Fix bug in DORCSD2BY1 (from Reference-LAPACK PR697) --- lapack-netlib/SRC/dorcsd2by1.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/dorcsd2by1.f b/lapack-netlib/SRC/dorcsd2by1.f index 06bf53db1..25fab0f33 100644 --- a/lapack-netlib/SRC/dorcsd2by1.f +++ 
b/lapack-netlib/SRC/dorcsd2by1.f @@ -580,7 +580,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( JOBV1T, 'N', JOBU1, JOBU2, 'T', M, Q, P, THETA, - $ WORK(IPHI), V1T, LDV1T, DUM2, 1, U1, LDU1, U2, + $ WORK(IPHI), V1T, LDV1T, DUM1, 1, U1, LDU1, U2, $ LDU2, WORK(IB11D), WORK(IB11E), WORK(IB12D), $ WORK(IB12E), WORK(IB21D), WORK(IB21E), $ WORK(IB22D), WORK(IB22E), WORK(IBBCSD), LBBCSD, @@ -635,7 +635,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( 'N', JOBV1T, JOBU2, JOBU1, 'T', M, M-Q, M-P, - $ THETA, WORK(IPHI), DUM2, 1, V1T, LDV1T, U2, + $ THETA, WORK(IPHI), DUM1, 1, V1T, LDV1T, U2, $ LDU2, U1, LDU1, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), @@ -706,7 +706,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( JOBU2, JOBU1, 'N', JOBV1T, 'N', M, M-P, M-Q, - $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM2, + $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM1, $ 1, V1T, LDV1T, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), From 8408357bab29276844378ed360b5e64e06b5e8ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Nov 2022 14:01:48 +0100 Subject: [PATCH 136/154] Update LAPACK version number to 3.11.0 --- lapack-netlib/INSTALL/ilaver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c index 83ef3e0d8..b274af292 100644 --- a/lapack-netlib/INSTALL/ilaver.c +++ b/lapack-netlib/INSTALL/ilaver.c @@ -573,7 +573,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 9; + *vers_minor__ = 11; *vers_patch__ = 0; /* ===================================================================== */ From e6e2a63650bf6f789693fd5ec788d3f35371196c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Nov 2022 14:02:21 +0100 Subject: [PATCH 137/154] Update LAPACK version number to 3.11.0 --- lapack-netlib/INSTALL/ilaver.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index 79fe597ae..a246c37cb 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -60,7 +60,7 @@ INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 9 + VERS_MINOR = 11 VERS_PATCH = 0 * ===================================================================== * From 0b2f8dabbf9e6c4de8e4b62b3a8df96097a2c23f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:30:35 +0100 Subject: [PATCH 138/154] Fix array dimension (Reference-LAPACK 758) --- lapack-netlib/TESTING/EIG/csyl01.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f index e21f1a7a0..82d790daa 100644 --- a/lapack-netlib/TESTING/EIG/csyl01.f +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -124,7 +124,7 @@ $ C( MAXM, MAXN ), CC( MAXM, MAXN ), $ X( MAXM, MAXN ), $ DUML( MAXM ), DUMR( MAXN ), - $ D( MIN( MAXM, MAXN ) ) + $ D( MAX( MAXM, MAXN ) ) REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. 
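The two ILAVER patches above bump the reported LAPACK version from 3.9.0 to 3.11.0. A quick way to confirm what a given build actually reports is to call ILAVER from C; the trailing-underscore name and plain int arguments below follow the most common Fortran calling convention but are platform- and build-dependent (ILP64 builds use 64-bit integers), so treat the declaration as an assumption rather than an OpenBLAS-provided prototype.

#include <stdio.h>

void ilaver_(int *vers_major, int *vers_minor, int *vers_patch);

int main(void)
{
    int major, minor, patch;
    ilaver_(&major, &minor, &patch);
    printf("LAPACK %d.%d.%d\n", major, minor, patch);   /* expect 3.11.0 */
    return 0;
}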
From bc3393f703dadb95b7d69ca61a74bff793841fc5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:31:18 +0100 Subject: [PATCH 139/154] Fix array dimension (Reference-LAPACK 758) --- lapack-netlib/TESTING/EIG/zsyl01.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f index 1e8619a34..329f39dc4 100644 --- a/lapack-netlib/TESTING/EIG/zsyl01.f +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -124,7 +124,7 @@ $ C( MAXM, MAXN ), CC( MAXM, MAXN ), $ X( MAXM, MAXN ), $ DUML( MAXM ), DUMR( MAXN ), - $ D( MIN( MAXM, MAXN ) ) + $ D( MAX( MAXM, MAXN ) ) DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. From 730ed549e6f0cfced4c4874da012baf1e464fdeb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:35:23 +0100 Subject: [PATCH 140/154] Fix typo in EXTERNAL (Reference-LAPACK PR760) --- lapack-netlib/TESTING/EIG/derred.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f index 6df517825..11a932052 100644 --- a/lapack-netlib/TESTING/EIG/derred.f +++ b/lapack-netlib/TESTING/EIG/derred.f @@ -99,7 +99,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, DGEES, DGEESX, DGEEV, DGEEVX, DGEJSV, - $ DGESDD, DGESVD, DGESVDX, DGESVQ + $ DGESDD, DGESVD, DGESVDX, DGESVDQ * .. * .. External Functions .. LOGICAL DSLECT, LSAMEN From 825ae316e2195997349958479ce27da01e9fb77e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:36:10 +0100 Subject: [PATCH 141/154] Fix typo in EXTERNAL (Reference-LAPACK PR760) --- lapack-netlib/TESTING/EIG/zerred.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f index d1219c02b..1876c1f1d 100644 --- a/lapack-netlib/TESTING/EIG/zerred.f +++ b/lapack-netlib/TESTING/EIG/zerred.f @@ -100,7 +100,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEES, ZGEESX, ZGEEV, ZGEEVX, ZGESVJ, - $ ZGESDD, ZGESVD, ZGESVDX, ZGESVQ + $ ZGESDD, ZGESVD, ZGESVDX, ZGESVDQ * .. * .. External Functions .. 
LOGICAL LSAMEN, ZSLECT From 7694ff495f26c29cc24771a29ccd13f4191a2baf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:40:59 +0100 Subject: [PATCH 142/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c index 8910aee7d..48d346611 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From d952cbf7bc23c1cb8fb7d2bdf4ddd888f4860905 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:41:50 +0100 Subject: [PATCH 143/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c index 80d94ead9..b39000d42 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From 74962c7f53110d915b4597344d2b397c150a9936 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:42:29 +0100 Subject: [PATCH 144/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c index 793f3833d..cffee6c98 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From c2ba4e6249e11661d9b8b5f0717a8256a1068143 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:43:34 +0100 Subject: [PATCH 145/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c index 881052331..faef6da50 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_ztz_trans( int 
matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From 19fd2d7f00325b846a1693eeb57ba2ad21ead8d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:19:07 +0100 Subject: [PATCH 146/154] Use LSAME for character comparison (Reference-LAPACK PR755) --- lapack-netlib/SRC/iparam2stage.F | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index c153eef22..c701c2be0 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -178,7 +178,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + LOGICAL LSAME + EXTERNAL ILAENV, LSAME * .. * .. Executable Statements .. * @@ -310,7 +311,7 @@ * * Will add the VECT OPTION HERE next release VECT = OPTS(1:1) - IF( VECT.EQ.'N' ) THEN + IF( LSAME( VECT, 'N' ) ) THEN LHOUS = MAX( 1, 4*NI ) ELSE * This is not correct, it need to call the ALGO and the stage2 From 0d26f1a4c7dc7e3829b3bde8f7d2e3a93f9fbbd3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:22:27 +0100 Subject: [PATCH 147/154] Fix wrong indexation in test (Reference-LAPACK PR755) --- lapack-netlib/TESTING/LIN/schktr.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 5aeb1ce88..33f07726e 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -559,7 +559,7 @@ $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) * CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, - $ SCALE3 ( 1 ), RWORK, ONE, B( N+1 ), LDA, + $ SCALE3 ( 1 ), RWORK, ONE, B( 1 ), LDA, $ X, LDA, WORK, RESULT( 10 ) ) CALL SSCAL( N, BIGNUM, X, 1 ) CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, From b0393ea4e17d4910043224791999d972f031cfde Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:27:46 +0100 Subject: [PATCH 148/154] Fix test (Reference-LAPACK PR764) --- lapack-netlib/TESTING/LIN/schktr.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 33f07726e..92d876108 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -555,11 +555,11 @@ * IF( INFO.NE.0 ) $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, - $ UPLO // TRANS // DIAG // 'Y', N, N, + $ UPLO // TRANS // DIAG // 'N', N, N, $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) * CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, - $ SCALE3 ( 1 ), RWORK, ONE, B( 1 ), LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, $ X, LDA, WORK, RESULT( 10 ) ) CALL SSCAL( N, BIGNUM, X, 1 ) CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, From a5470521ee4737060da10ee6bd97d229a19d49e9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:31:25 +0100 Subject: [PATCH 149/154] Fix array indexation in copy, and fix test (Reference-LAPACK PR764) --- lapack-netlib/TESTING/LIN/cchktr.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index c55b07643..4b09361d8 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -541,7 +541,7 @@ * SRNAMT = 'CLATRS3' CALL CCOPY( N, X, 1, B, 1 ) - CALL 
CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B( N+1 ), 1 ) CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, @@ -551,7 +551,7 @@ * IF( INFO.NE.0 ) $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, - $ UPLO // TRANS // DIAG // 'Y', N, N, + $ UPLO // TRANS // DIAG // 'N', N, N, $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, @@ -559,7 +559,7 @@ CALL CSSCAL( N, BIGNUM, X, 1 ) CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, - $ X, LDA, WORK, RESULT( 10 ) ) + $ X, LDA, WORK, RES ) RESULT( 10 ) = MAX( RESULT( 10 ), RES ) * * Print information about the tests that did not pass From 4f7b77e08aa77f6e907e858f3160ecbffe27027e Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 25 Nov 2022 15:24:32 +0000 Subject: [PATCH 150/154] Remove unnecessary instructions from Advanced SIMD dot The existing kernel was issuing extra instructions to organise the arguments into the same registers they would usually be in and similarly to put the result into the appropriate register. This has an impact on smaller sized dots and seemed like a quick fix --- kernel/arm64/dot_thunderx2t99.c | 247 +++++++++++++++----------------- 1 file changed, 118 insertions(+), 129 deletions(-) diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 3940acddd..9131f1e86 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -1,5 +1,6 @@ /*************************************************************************** Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,25 +37,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RETURN_TYPE double #endif -#define N "x0" /* vector length */ -#define X "x1" /* "X" vector address */ -#define INC_X "x2" /* "X" stride */ -#define Y "x3" /* "Y" vector address */ -#define INC_Y "x4" /* "Y" stride */ -#define J "x5" /* loop variable */ - #if !defined(DOUBLE) #if !defined(DSDOT) +#define DOT_MOD "s" #define REG0 "wzr" -#define DOTF "s0" #define TMPX "s16" #define TMPY "s24" #define INC_SHIFT "2" #define N_DIV_SHIFT "6" #define N_REM_MASK "63" #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "s16" #define TMPX1 "d2" #define TMPY "s24" @@ -64,8 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N_REM_MASK "15" #endif #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "d16" #define TMPY "d24" #define INC_SHIFT "3" @@ -73,59 +67,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define N_REM_MASK "31" #endif +#define OUT "%"DOT_MOD"[DOT_]" + #if !defined(DOUBLE) #if !defined(DSDOT) #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" @@ -142,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else /* !defined(DSDOT) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ " fcvt "TMPX1", "TMPX" \n" \ " fcvt "TMPY1", "TMPY" \n" \ " fmul "TMPX1", "TMPX1", "TMPY1" \n" \ - " fadd "DOTF", "DOTF", "TMPX1" \n" + " fadd "OUT", "OUT", "TMPX1" \n" #define KERNEL_F \ - " ldp q18, q19, ["X"] \n" \ - " ldp q26, q27, ["Y"] \n" \ + " ldp q18, q19, [%[X_]] \n" \ + " ldp q26, q27, [%[Y_]] \n" \ " fcvtl v16.2d, v18.2s \n" \ " fcvtl2 v17.2d, v18.4s \n" \ " fcvtl v18.2d, v19.2s \n" \ @@ -163,8 +159,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fcvtl2 v25.2d, v26.4s \n" \ " fcvtl v26.2d, v27.2s \n" \ " fcvtl2 v27.2d, v27.4s \n" \ - " ldp q22, q23, ["X", #32] \n" \ - " ldp q30, q31, ["Y", #32] \n" \ + " ldp q22, q23, [%[X_], #32] \n" \ + " ldp q30, q31, [%[Y_], #32] \n" \ " fcvtl v20.2d, v22.2s \n" \ " fcvtl2 v21.2d, v22.4s \n" \ " fcvtl v22.2d, v23.2s \n" \ @@ -173,16 +169,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fcvtl2 v29.2d, v30.4s \n" \ " fcvtl v30.2d, v31.2s \n" \ " fcvtl2 v31.2d, v31.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " add "Y", "Y", #64 \n" \ - " add "X", "X", #64 \n" \ + " add %[Y_], %[Y_], #64 \n" \ + " add %[X_], %[X_], #64 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ @@ -196,60 +192,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DSDOT) */ #else /* !defined(DOUBLE) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" @@ -261,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DOUBLE) */ #if defined(SMP) @@ -272,17 +268,14 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - RETURN_TYPE dot = 0.0 ; + RETURN_TYPE dot = 0.0; if ( n < 0 ) return dot; + BLASLONG j = 0; + __asm__ __volatile__ ( - " mov "N", %[N_] \n" - " mov "X", %[X_] \n" - " mov "INC_X", %[INCX_] \n" - " mov "Y", %[Y_] \n" - " mov "INC_Y", %[INCY_] \n" - " fmov "DOTF", "REG0" \n" + " fmov "OUT", "REG0" \n" " fmov d1, xzr \n" " fmov d2, xzr \n" " fmov d3, xzr \n" @@ -290,42 +283,40 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" - " cmp "N", xzr \n" - " ble 9f //dot_kernel_L999 \n" - " cmp "INC_X", #1 \n" + " cmp %[INCX_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" - " cmp "INC_Y", #1 \n" + " cmp %[INCY_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" "1: //dot_kernel_F_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #"N_DIV_SHIFT" \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" + " cmp %[J_], xzr \n" " beq 3f //dot_kernel_F1 \n" " .align 5 \n" "2: //dot_kernel_F: \n" " "KERNEL_F" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" "3: //dot_kernel_F1: \n" - " ands "J", "N", #"N_REM_MASK" \n" + " ands %[J_], %[N_], #"N_REM_MASK" \n" " ble 9f //dot_kernel_L999 \n" "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 4b //dot_kernel_F10 \n" " b 9f //dot_kernel_L999 \n" "5: //dot_kernel_S_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #2 \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #2 \n" + " cmp %[J_], xzr \n" " ble 7f //dot_kernel_S1 \n" "6: //dot_kernel_S4: \n" @@ -333,32 +324,30 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 6b //dot_kernel_S4 \n" "7: //dot_kernel_S1: \n" - " ands "J", "N", #3 \n" + " ands %[J_], %[N_], #3 \n" " ble 9f //dot_kernel_L999 \n" "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 8b //dot_kernel_S10 \n" "9: //dot_kernel_L999: \n" - " str "DOTF", [%[DOT_]] \n" - - : - : [DOT_] "r" (&dot), //%0 - [N_] "r" (n), //%1 - [X_] "r" (x), //%2 - [INCX_] "r" (inc_x), //%3 - [Y_] "r" (y), //%4 - [INCY_] "r" (inc_y) //%5 + + : [DOT_] "=&w" (dot) + : [N_] "r" (n), + [X_] "r" (x), + [INCX_] "r" (inc_x), + [Y_] "r" (y), + [INCY_] "r" (inc_y), + [J_] "r" (j) : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return dot; From bae45d94d13ee0e4b8e041b4a10d82198298409d Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Tue, 29 Nov 2022 08:02:45 -0500 Subject: [PATCH 151/154] scal benchmark: eliminate y, move init/timing out of loop Removing y avoids cache effects (if y is the 
size of the L1 cache, the main array x is removed from it). Moving init and timing out of the loop makes the scal benchmark behave like the gemm benchmark, and allows higher accuracy for smaller test cases since the loop overhead is much smaller than the timing overhead. Example: OPENBLAS_LOOPS=10000 ./dscal.goto 1024 8192 1024 on AMD Zen2 (7532) with 32k (4k doubles) L1 cache per core. Before From : 1024 To : 8192 Step = 1024 Inc_x = 1 Inc_y = 1 Loops = 10000 SIZE Flops 1024 : 5627.08 MFlops 0.000000 sec 2048 : 5907.34 MFlops 0.000000 sec 3072 : 5553.30 MFlops 0.000001 sec 4096 : 5446.38 MFlops 0.000001 sec 5120 : 5504.61 MFlops 0.000001 sec 6144 : 5501.80 MFlops 0.000001 sec 7168 : 5547.43 MFlops 0.000001 sec 8192 : 5548.46 MFlops 0.000001 sec After From : 1024 To : 8192 Step = 1024 Inc_x = 1 Inc_y = 1 Loops = 10000 SIZE Flops 1024 : 6310.28 MFlops 0.000000 sec 2048 : 6396.29 MFlops 0.000000 sec 3072 : 6439.14 MFlops 0.000000 sec 4096 : 6327.14 MFlops 0.000001 sec 5120 : 5628.24 MFlops 0.000001 sec 6144 : 5616.41 MFlops 0.000001 sec 7168 : 5553.13 MFlops 0.000001 sec 8192 : 5600.88 MFlops 0.000001 sec We can see the L1->L2 switchover point is now where it should be, and the number of flops for L1 is more accurate. --- benchmark/scal.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/benchmark/scal.c b/benchmark/scal.c index 8de6cfd04..79bcb6729 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int main(int argc, char *argv[]){ - FLOAT *x, *y; + FLOAT *x; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; @@ -74,10 +74,6 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - #ifdef __linux srandom(getpid()); #endif @@ -91,30 +87,20 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + begin(); for (l=0; l Date: Fri, 25 Nov 2022 14:46:24 +0000 Subject: [PATCH 152/154] Set SWITCH_RATIO for Arm(R) Neoverse(TM) V1 CPUs From testing this yields better results than the default of `2`. --- param.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/param.h b/param.h index 514b13a3a..19cbe75a5 100644 --- a/param.h +++ b/param.h @@ -3367,6 +3367,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) +#define SWITCH_RATIO 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From fd4f52c797328b4b134f62b5e97bbbba3e79426a Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 17 Nov 2022 06:49:57 +0000 Subject: [PATCH 153/154] Add SVE implementation for sdot/ddot This adds an SVE implementation to sdot/ddot when available, falling back to the previous Advanced SIMD kernel where there's no SVE implementation for the kernel. All the targets were essentially treating `dot_thunderx2t99.c` as the Advanced SIMD implementation so I've renamed it to better fit with the feature detection. 
--- Makefile.arm64 | 10 +- getarch.c | 2 +- kernel/arm64/KERNEL.NEOVERSEN1 | 4 +- kernel/arm64/KERNEL.NEOVERSEN2 | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 4 +- kernel/arm64/KERNEL.THUNDERX2T99 | 4 +- kernel/arm64/KERNEL.THUNDERX3T110 | 4 +- kernel/arm64/dot.c | 111 ++++++++++++++++++ ...{dot_thunderx2t99.c => dot_kernel_asimd.c} | 66 +---------- kernel/arm64/dot_kernel_sve.c | 66 +++++++++++ 10 files changed, 194 insertions(+), 81 deletions(-) create mode 100644 kernel/arm64/dot.c rename kernel/arm64/{dot_thunderx2t99.c => dot_kernel_asimd.c} (87%) create mode 100644 kernel/arm64/dot_kernel_sve.c diff --git a/Makefile.arm64 b/Makefile.arm64 index e2c471c2b..fc986f4c0 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -70,12 +70,12 @@ endif ifeq ($(CORE), NEOVERSEN1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) -CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=neoverse-n1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif @@ -94,12 +94,12 @@ ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ10), 1) ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) -CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=native endif @@ -133,7 +133,7 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a -mtune=native endif diff --git a/getarch.c b/getarch.c index cde5b4e83..f26ca6325 100644 --- a/getarch.c +++ b/getarch.c @@ -1410,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
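For orientation, here is a condensed sketch of the predicated SVE loop this change introduces, reduced to double precision, a single accumulator and unit strides (the only case the new kernel handles). The function name is illustrative only and is not part of the patch; the committed dot_kernel_sve.c in the diff below covers both precisions via macros and unrolls by two vector widths:

    #include <arm_sve.h>
    #include <stdint.h>

    /* Illustrative sketch: predicated SVE dot product, double precision,
     * contiguous x and y. Not part of the patch itself. */
    double ddot_sve_sketch(int64_t n, const double *x, const double *y)
    {
        svfloat64_t acc = svdup_f64(0.0);
        int64_t vl = (int64_t)svcntd();           /* elements per SVE vector */

        for (int64_t i = 0; i < n; i += vl) {
            svbool_t pg = svwhilelt_b64(i, n);    /* predicate masks the tail */
            svfloat64_t xv = svld1_f64(pg, &x[i]);
            svfloat64_t yv = svld1_f64(pg, &y[i]);
            acc = svmla_f64_m(pg, acc, xv, yv);   /* acc += x[i..] * y[i..]   */
        }
        return svaddv_f64(svptrue_b64(), acc);    /* horizontal reduction     */
    }

Because svwhilelt produces a partial predicate on the last iteration, no separate scalar remainder loop is needed, unlike the Advanced SIMD path.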
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ - "-march=armv8.4-a -mtune=neoverse-v1" + "-march=armv8.4-a+sve -mtune=neoverse-v1" #define LIBNAME "neoversev1" #define CORENAME "NEOVERSEV1" #endif diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index ea010db42..9a5938459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index ae386d6e1..b743d1a43 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index ea010db42..9a5938459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a20d0d4a6..41cedc851 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index a20d0d4a6..41cedc851 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c new file mode 100644 index 000000000..094bce696 --- /dev/null +++ b/kernel/arm64/dot.c @@ -0,0 +1,111 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#ifdef HAVE_SVE +#include "dot_kernel_sve.c" +#endif +#include "dot_kernel_asimd.c" + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + RETURN_TYPE dot = 0.0 ; + + if ( n <= 0 ) return dot; + +#ifdef HAVE_SVE + if (inc_x == 1 && inc_y == 1) { + return dot_kernel_sve(n, x, y); + } +#endif + + return dot_kernel_asimd(n, x, inc_x, y, inc_y); +} + +#if defined(SMP) +static int dot_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); + + return 0; +} +#endif + +RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + RETURN_TYPE dot = 0.0; + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + dot = dot_compute(n, x, inc_x, y, inc_y); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + RETURN_TYPE *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)dot_thread_function, nthreads); + + ptr = (RETURN_TYPE *)result; + for (i = 0; i < nthreads; i++) { + dot = dot + (*ptr); + ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + dot = dot_compute(n, x, inc_x, y, inc_y); +#endif + + return dot; +} diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_kernel_asimd.c similarity index 87% rename from kernel/arm64/dot_thunderx2t99.c rename to kernel/arm64/dot_kernel_asimd.c index 9131f1e86..1288838f8 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_kernel_asimd.c @@ -260,18 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" faddp "OUT", v0.2d \n" #endif /* !defined(DOUBLE) */ -#if defined(SMP) -extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, - BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); -#endif - -static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { RETURN_TYPE dot = 0.0; - - if ( n < 0 ) return dot; - BLASLONG j = 0; __asm__ __volatile__ ( @@ -352,58 +343,3 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B return dot; } - -#if defined(SMP) -static int dot_thread_function(BLASLONG n, BLASLONG dummy0, - BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) -{ - *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); - - return 0; -} -#endif - -RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ -#if defined(SMP) - int nthreads; - FLOAT dummy_alpha; -#endif - RETURN_TYPE dot = 0.0; - -#if defined(SMP) - if (inc_x == 0 || inc_y == 0 || n <= 10000) - nthreads = 1; - else - nthreads = num_cpu_avail(1); - - if (nthreads == 1) { - dot = dot_compute(n, x, inc_x, y, inc_y); - } else { - int mode, i; - char result[MAX_CPU_NUMBER * sizeof(double) * 2]; - RETURN_TYPE *ptr; - -#if !defined(DOUBLE) - mode = BLAS_SINGLE | BLAS_REAL; -#else - mode = BLAS_DOUBLE | BLAS_REAL; -#endif - - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, - x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); - - ptr = (RETURN_TYPE *)result; - for (i = 0; i < nthreads; i++) { - dot = dot + (*ptr); - ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); - } - } -#else - dot = dot_compute(n, x, inc_x, y, inc_y); -#endif - - return dot; -} diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c new file mode 100644 index 000000000..8460e0d5e --- /dev/null +++ b/kernel/arm64/dot_kernel_sve.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include <arm_sve.h> + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { + SVE_TYPE acc_a = SVE_ZERO; + SVE_TYPE acc_b = SVE_ZERO; + + BLASLONG sve_width = SVE_WIDTH; + + for (BLASLONG i = 0; i < n; i += sve_width * 2) { + svbool_t pg_a = SVE_WHILELT(i, n); + svbool_t pg_b = SVE_WHILELT(i + sve_width, n); + + SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); + SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); + SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); + SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + + acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); + acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); + } + + return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); +} From eea006a6886856bc1b89817052a8c471234d9c6b Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 29 Nov 2022 17:53:38 +0000 Subject: [PATCH 154/154] Wrap SVE header with __has_include check --- kernel/arm64/dot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c index 094bce696..4607ebc59 100644 --- a/kernel/arm64/dot.c +++ b/kernel/arm64/dot.c @@ -29,7 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +// Some compilers will report feature support for SVE without the appropriate +// header available #ifdef HAVE_SVE +#if defined __has_include +#if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE +#define USE_SVE +#endif +#endif +#endif + +#ifdef USE_SVE #include "dot_kernel_sve.c" #endif #include "dot_kernel_asimd.c" @@ -46,7 +56,7 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B if ( n <= 0 ) return dot; -#ifdef HAVE_SVE +#ifdef USE_SVE if (inc_x == 1 && inc_y == 1) { return dot_kernel_sve(n, x, y); }
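As a quick sanity check of the guard added in this last patch, the same preprocessor logic can be exercised in a tiny standalone program. This is not part of the patch; the file name and messages below are made up for illustration, and HAVE_SVE stands in for the define the OpenBLAS build normally passes via getarch:

    /* sve_probe.c -- hypothetical helper mirroring the USE_SVE guard in dot.c */
    #include <stdio.h>

    #ifdef HAVE_SVE                /* normally set by the OpenBLAS build */
    #if defined __has_include
    #if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE
    #define USE_SVE
    #endif
    #endif
    #endif

    int main(void) {
    #ifdef USE_SVE
        puts("arm_sve.h present and SVE enabled: the SVE dot kernel would be built");
    #else
        puts("no usable SVE support: only the Advanced SIMD dot kernel would be built");
    #endif
        return 0;
    }

Building this with -DHAVE_SVE, with and without SVE code generation enabled (for example -march=armv8.2-a+sve), shows which path dot.c would take on a given toolchain, including compilers that report SVE support but do not ship the arm_sve.h header.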